diff --git a/c_fast.c b/c_fast.c
index 1b7e080..111240a 100644
--- a/c_fast.c
+++ b/c_fast.c
@@ -24,7 +24,7 @@
    IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
    *******************
-   
+
    See https://github.com/logicchains/LPATHBench
 
    The edges are stored as an array of pointers to arrays of edges, i.e: the edges
@@ -34,13 +34,13 @@
         maximize the density of useful data in caches
      * it achieves performance almost as good as a statically sized matrix, while
        allowing dynamic sizing
-       
+
    This has only been optimised for Tegra K1, a Cortex-A15-based SoC when compiled
    with gcc 4.8.2. Performance seems highly dependent on code alignment.
-   
+
    Peformance (with provided benchmark graph): around 10% speedup compared to cpp,
    and around 15% compared to C/HIGHBIT.
-   
+
    Note that C/HIGHBIT has been observed to be faster on a second sparse graph with
    35 nodes.
 */
@@ -82,23 +82,22 @@ void parse_graph(edge_t ***costs_p, int *no_of_nodes_p) {
   int target_node;
   int cost;
   int index = 0;
-  int wp;
   int no_of_nodes;
   edge_t **costs;
-  
+
   f = fopen("agraph", "r");
   assert(f != NULL);
-  
+
   ret = fscanf(f, "%d", no_of_nodes_p);
   assert (ret == 1);
   no_of_nodes = *no_of_nodes_p;
-  
+
   costs = malloc(sizeof(edge_t*) * no_of_nodes);
   assert(costs != NULL);
   *costs_p = costs;
   edge_t *buf = malloc(sizeof(edge_t) * no_of_nodes);
   assert(buf != NULL);
-  
+
   while(fscanf(f, "%d %d %d\n", &c_node, &target_node, &cost) == 3) {
     assert((c_node == prev_node || c_node == (prev_node + 1)) && c_node < no_of_nodes && cost >= 0);
     if (c_node != prev_node) {
@@ -111,17 +110,17 @@ void parse_graph(edge_t ***costs_p, int *no_of_nodes_p) {
     }
     buf[target_node].target = target_node;
     buf[target_node].cost = cost;
- 
+
     index++;
   }
-  
+
   insert_node_edges(costs, prev_node, buf, no_of_nodes, index);
 }
 
 int get_max_cost_small(edge_t **c, const int c_node, uint32_t visited) {
   int max = 0;
   int dist;
-  
+
   visited |= 1 << c_node;
   for (int index = 0; c[c_node][index].cost >= 0; index++) {
     if (!(visited & (1 << c[c_node][index].target))) {
@@ -130,7 +129,7 @@ int get_max_cost_small(edge_t **c, const int c_node, uint32_t visited) {
     }
   }
   visited &= ~(1 << c_node);
-  
+
   return max;
 }
 
@@ -138,7 +137,7 @@ int get_max_cost(edge_t **c, const int c_node, uint32_t *visited) {
   int max = 0;
   int dist;
   int target;
-  
+
   visited[c_node >> 5] |= 1 << (c_node & 0x1f);
   for (int index = 0; c[c_node][index].cost >= 0; index++) {
     target = c[c_node][index].target;
@@ -148,7 +147,7 @@ int get_max_cost(edge_t **c, const int c_node, uint32_t *visited) {
     }
   }
   visited[c_node >> 5] &= ~(1 << (c_node & 0x1f));
-  
+
   return max;
 }
 
@@ -159,7 +158,7 @@ int main() {
   struct timeval start, end, duration;
   edge_t **costs;
   int no_of_nodes;
-  
+
   parse_graph(&costs, &no_of_nodes);
 
   gettimeofday(&start, NULL);
@@ -171,9 +170,8 @@ int main() {
     result = get_max_cost_small(costs, 0, 0);
   }
   gettimeofday(&end, NULL);
-  
+
   timersub(&end, &start, &duration);
   ms = duration.tv_sec*1000 + duration.tv_usec/1000;
-  printf("%d LANGUAGE C-fast %llu\n", result, ms);
+  printf("%d LANGUAGE C-fast %lu\n", result, ms);
 }
-
diff --git a/makefile b/makefile
index 918a222..0dda4cf 100644
--- a/makefile
+++ b/makefile
@@ -1,15 +1,26 @@
 NUM_NODES = 10
 WORLD_SIZE = 1000
 
+COMMON_CFLAGS = -std=gnu99 -O2 -march=native -fomit-frame-pointer -Wall -Wextra
+COMMON_CXXFLAGS = -std=c++14 -Wall -O2 -march=native
 
 buildall: c_fast c_fast_arm f03 c fsharp cpp_gcc cpp_clang cpp_cached racket csharp java haskell ocaml lisp rust rust_unsafe go gccgo d nim oraclejava crystal
 
+.PHONY: clean  # removes build outputs; phony so a file named "clean" cannot mask the rule
+clean:
+	rm -f c_fast_arm c_fast f03 fs.exe cpp_gcc cpp_clang cpp_plain cpp_cached \
+		cs.exe jv.class hs ml lisp rs rs_unsafe go gccgo d nim crystal c
+# C targets
+c: c.c
+	$(CC) $(COMMON_CFLAGS) c.c -o c -DUSE_HIGHBIT
+
 c_fast_arm: c_fast.c
-	gcc -marm -falign-functions=32 -g -std=gnu99 -O2 -mcpu=native -fomit-frame-pointer c_fast.c -o ./c_fast_arm
-	
+	$(CC) -marm -falign-functions=32 $(COMMON_CFLAGS) c_fast.c -o ./c_fast_arm
+
 c_fast: c_fast.c
-	gcc -falign-functions=32 -g -std=gnu99 -O2 -mcpu=native -fomit-frame-pointer c_fast.c -o ./c_fast
+	$(CC) -falign-functions=32 $(COMMON_CFLAGS) c_fast.c -o ./c_fast
 
+# Non-C targets
 f03:    f03.f03
 	gfortran -O2 -mcpu=native f03.f03 -o f03
 
@@ -17,19 +28,16 @@ fsharp: fs.fs
 	fsharpc fs.fs
 
 cpp_gcc: cpp.cpp
-	g++ cpp.cpp -std=c++14 -Wall -O2 -mcpu=native -DCOMPILER='"gcc"' -o cpp_gcc
+	g++ cpp.cpp $(COMMON_CXXFLAGS) -DCOMPILER='"gcc"' -o cpp_gcc
 
 cpp_clang: cpp.cpp
-	clang++ cpp.cpp -std=c++14 -Wall -O2 -mcpu=native -DCOMPILER='"clang"' -o cpp_clang
+	clang++ cpp.cpp $(COMMON_CXXFLAGS) -DCOMPILER='"clang"' -o cpp_clang
 
 cpp_plain: cpp_plain.cpp
-	clang++ cpp_plain.cpp -std=c++14 -Wall -O2 -mcpu=native -DCOMPILER='"clang"' -o cpp_plain
+	clang++ cpp_plain.cpp $(COMMON_CXXFLAGS) -DCOMPILER='"clang"' -o cpp_plain
 
 cpp_cached: cpp_cached.cpp
-	clang++ cpp_cached.cpp -std=c++14 -Wall -O2 -mcpu=native -o cpp_cached
-
-c: c.c
-	gcc -g -std=gnu99 -Wall -Wextra c.c  -O2 -mcpu=native -o c -DUSE_HIGHBIT
+	clang++ cpp_cached.cpp $(COMMON_CXXFLAGS) -o cpp_cached
 
 racket: rkt.rkt
 	raco exe rkt.rkt
@@ -59,7 +67,7 @@ rust: rs.rs
 	rustc rs.rs --opt-level=3 -C no-stack-check
 
 rust_unsafe: rs_unsafe.rs
-	rustc rs_unsafe.rs --opt-level=3	
+	rustc rs_unsafe.rs --opt-level=3
 
 go: go.go
 	go build go.go
@@ -80,7 +88,7 @@ nim: nim.nim
 	nim c --cc:clang --passC:-mcpu=native -d:release nim.nim
 
 scala: scala.scala
-	scalac scala.scala 
+	scalac scala.scala
 
 graphbuilder: mkgraph.go
 	go build mkgraph.go