Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compilation fixes #70

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 17 additions & 19 deletions c_fast.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*******************

See https://github.com/logicchains/LPATHBench

The edges are stored as an array of pointers to arrays of edges, i.e: the edges
Expand All @@ -34,13 +34,13 @@
maximize the density of useful data in caches
* it achieves performance almost as good as a statically sized matrix, while
allowing dynamic sizing

This has only been optimised for Tegra K1, a Cortex-A15-based SoC when compiled
with gcc 4.8.2. Performance seems highly dependent on code alignment.

Performance (with provided benchmark graph): around 10% speedup compared to cpp,
and around 15% compared to C/HIGHBIT.

Note that C/HIGHBIT has been observed to be faster on a second sparse graph with
35 nodes.
*/
Expand Down Expand Up @@ -82,23 +82,22 @@ void parse_graph(edge_t ***costs_p, int *no_of_nodes_p) {
int target_node;
int cost;
int index = 0;
int wp;
int no_of_nodes;
edge_t **costs;

f = fopen("agraph", "r");
assert(f != NULL);

ret = fscanf(f, "%d", no_of_nodes_p);
assert (ret == 1);
no_of_nodes = *no_of_nodes_p;

costs = malloc(sizeof(edge_t*) * no_of_nodes);
assert(costs != NULL);
*costs_p = costs;
edge_t *buf = malloc(sizeof(edge_t) * no_of_nodes);
assert(buf != NULL);

while(fscanf(f, "%d %d %d\n", &c_node, &target_node, &cost) == 3) {
assert((c_node == prev_node || c_node == (prev_node + 1)) && c_node < no_of_nodes && cost >= 0);
if (c_node != prev_node) {
Expand All @@ -111,17 +110,17 @@ void parse_graph(edge_t ***costs_p, int *no_of_nodes_p) {
}
buf[target_node].target = target_node;
buf[target_node].cost = cost;

index++;
}

insert_node_edges(costs, prev_node, buf, no_of_nodes, index);
}

int get_max_cost_small(edge_t **c, const int c_node, uint32_t visited) {
int max = 0;
int dist;

visited |= 1 << c_node;
for (int index = 0; c[c_node][index].cost >= 0; index++) {
if (!(visited & (1 << c[c_node][index].target))) {
Expand All @@ -130,15 +129,15 @@ int get_max_cost_small(edge_t **c, const int c_node, uint32_t visited) {
}
}
visited &= ~(1 << c_node);

return max;
}

int get_max_cost(edge_t **c, const int c_node, uint32_t *visited) {
int max = 0;
int dist;
int target;

visited[c_node >> 5] |= 1 << (c_node & 0x1f);
for (int index = 0; c[c_node][index].cost >= 0; index++) {
target = c[c_node][index].target;
Expand All @@ -148,7 +147,7 @@ int get_max_cost(edge_t **c, const int c_node, uint32_t *visited) {
}
}
visited[c_node >> 5] &= ~(1 << (c_node & 0x1f));

return max;
}

Expand All @@ -159,7 +158,7 @@ int main() {
struct timeval start, end, duration;
edge_t **costs;
int no_of_nodes;

parse_graph(&costs, &no_of_nodes);

gettimeofday(&start, NULL);
Expand All @@ -171,9 +170,8 @@ int main() {
result = get_max_cost_small(costs, 0, 0);
}
gettimeofday(&end, NULL);

timersub(&end, &start, &duration);
ms = duration.tv_sec*1000 + duration.tv_usec/1000;
printf("%d LANGUAGE C-fast %llu\n", result, ms);
printf("%d LANGUAGE C-fast %lu\n", result, ms);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's incorrect on 32-bit architectures (including AArch32 and x86), on which a long unsigned int is a uint32_t.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But ms has type uint64_t!

Copy link
Contributor

@lgeek lgeek Jun 11, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry if that was not clear. ms has type uint64_t, which is why it has to be printed with %llu (which is a 64-bit unsigned integer on both 32-bit and 64-bit machines), and not with %lu (which is a 32-bit unsigned integer on some 32-bit machines and a 64-bit unsigned integer on 64-bit machines). These implementations are supposed to run on AArch32 and x86-64.

}

32 changes: 20 additions & 12 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,35 +1,43 @@
NUM_NODES = 10
WORLD_SIZE = 1000

COMMON_CFLAGS = -std=gnu99 -O2 -march=native -fomit-frame-pointer -Wall -Wextra
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-march=native is not equivalent to -mcpu=native. It needs to be -march=native -mtune=native.

Copy link
Author

@bjourne bjourne Jun 11, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

@lgeek lgeek Jun 11, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not documented to work like that in the GCC ARM docs. Also, the options accepted by -march are generic architectures, while the options accepted by -mtune are specific cores.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But previously the makefile didn't even specify -mtune. -march at least implies -mcpu.

COMMON_CXXFLAGS = -std=c++14 -Wall -O2 -march=native

buildall: c_fast c_fast_arm f03 c fsharp cpp_gcc cpp_clang cpp_cached racket csharp java haskell ocaml lisp rust rust_unsafe go gccgo d nim oraclejava crystal

clean:
rm -f c_fast_arm c_fast f03 fs.exe cpp_gcc cpp_clang cpp_plain cpp_cached \
cs.exe jv.class hs ml lisp rs rs_unsafe go gccgo d nim crystal d \
c
# C targets
c: c.c
$(CC) $(COMMON_CFLAGS) c.c -o c -DUSE_HIGHBIT

c_fast_arm: c_fast.c
gcc -marm -falign-functions=32 -g -std=gnu99 -O2 -mcpu=native -fomit-frame-pointer c_fast.c -o ./c_fast_arm
$(CC) -marm -falign-functions=32 $(COMMON_CFLAGS) c_fast.c -o ./c_fast_arm

c_fast: c_fast.c
gcc -falign-functions=32 -g -std=gnu99 -O2 -mcpu=native -fomit-frame-pointer c_fast.c -o ./c_fast
$(CC) -falign-functions=32 $(COMMON_CFLAGS) c_fast.c -o ./c_fast
Copy link
Contributor

@lgeek lgeek Jun 11, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compiler was hardcoded to gcc on purpose, because I haven't evaluated or optimised clang's output for this implementation and therefore the code or flags might need some tweaks to achieve the optimum performance. I don't insist on keeping it that way, but maybe adding a comment about that would be in order.


# Other
f03: f03.f03
gfortran -O2 -mcpu=native f03.f03 -o f03

fsharp: fs.fs
fsharpc fs.fs

cpp_gcc: cpp.cpp
g++ cpp.cpp -std=c++14 -Wall -O2 -mcpu=native -DCOMPILER='"gcc"' -o cpp_gcc
g++ cpp.cpp $(COMMON_CXXFLAGS) -DCOMPILER='"gcc"' -o cpp_gcc

cpp_clang: cpp.cpp
clang++ cpp.cpp -std=c++14 -Wall -O2 -mcpu=native -DCOMPILER='"clang"' -o cpp_clang
clang++ cpp.cpp $(COMMON_CXXFLAGS) -DCOMPILER='"clang"' -o cpp_clang

cpp_plain: cpp_plain.cpp
clang++ cpp_plain.cpp -std=c++14 -Wall -O2 -mcpu=native -DCOMPILER='"clang"' -o cpp_plain
clang++ cpp_plain.cpp $(COMMON_CXXFLAGS) -DCOMPILER='"clang"' -o cpp_plain

cpp_cached: cpp_cached.cpp
clang++ cpp_cached.cpp -std=c++14 -Wall -O2 -mcpu=native -o cpp_cached

c: c.c
gcc -g -std=gnu99 -Wall -Wextra c.c -O2 -mcpu=native -o c -DUSE_HIGHBIT
clang++ cpp_cached.cpp $(COMMON_CXXFLAGS) -o cpp_cached

racket: rkt.rkt
raco exe rkt.rkt
Expand Down Expand Up @@ -59,7 +67,7 @@ rust: rs.rs
rustc rs.rs --opt-level=3 -C no-stack-check

rust_unsafe: rs_unsafe.rs
rustc rs_unsafe.rs --opt-level=3
rustc rs_unsafe.rs --opt-level=3

go: go.go
go build go.go
Expand All @@ -80,7 +88,7 @@ nim: nim.nim
nim c --cc:clang --passC:-mcpu=native -d:release nim.nim

scala: scala.scala
scalac scala.scala
scalac scala.scala

graphbuilder: mkgraph.go
go build mkgraph.go
Expand Down