-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile.nvlink
87 lines (70 loc) · 4.8 KB
/
Makefile.nvlink
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# When building a library with CUDA code we should support diverse scenarios,
# where the CUDA code in the library may be used by:
# - host code in the library itself (i.e. a kernel defined and used within the library)
# - host code in a user program or library (i.e. a kernel defined in the library)
# - CUDA code in a user program or library (i.e. __device__ code in the library)
.PHONY: all clean
BUILD:=build$(suffix $(word 1, $(MAKEFILE_LIST)))
CUDA_BASE=/usr/local/cuda-9.2
CUDA_ARCH=sm_35 sm_50 sm_61
CUDA_CXXFLAGS=$(foreach ARCH,$(CUDA_ARCH),-gencode arch=$(ARCH:sm_%=compute_%),code=$(ARCH))
CUDA_DEFINES=-D__CUDACC_VER_MAJOR__=9 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=88
HOST_CXX=g++
HOST_CXXFLAGS=-fPIC -Wall
CXXFLAGS=-std=c++14 -O2 -I$(CUDA_BASE)/include -I.
LDFLAGS=-L$(BUILD)
CUDA_LDFLAGS=-L$(CUDA_BASE)/lib64/stubs -L$(CUDA_BASE)/lib64 -lcudadevrt -lcudart -lrt -lpthread -ldl
CUDA_DEVICE_LDFLAGS=-L$(CUDA_BASE)/lib64/stubs -L$(CUDA_BASE)/lib64 -lcudadevrt
PARTICLE_OBJECTS = $(BUILD)/particle/particle.o $(BUILD)/particle/v3.o
PROPAGATE_OBJECTS = $(BUILD)/propagate/propagate.o
all: $(BUILD)/app
clean:
rm -rf $(BUILD)
$(BUILD)/%.o: %.cc
@mkdir -p $(dir $@)
# compile individual host-only files
@#$(CUDA_BASE)/bin/nvcc -x c++ $(CXXFLAGS) $(CUDA_CXXFLAGS) -ccbin $(HOST_CXX) -Xcompiler '$(HOST_CXXFLAGS)' -c $< -o $@
$(HOST_CXX) $(CXXFLAGS) $(HOST_CXXFLAGS) -c $< -o $@
$(BUILD)/%.o $(BUILD)/%.cuda.o: %.cu
@mkdir -p $(dir $@)
# compile individual host/device files
$(CUDA_BASE)/bin/nvcc -x cu $(CXXFLAGS) $(CUDA_CXXFLAGS) -ccbin $(HOST_CXX) -Xcompiler '$(HOST_CXXFLAGS)' -dc $< -o $@
# extract CUDA fatbin to a separate object file
objcopy -j '.nv*' -j '__nv*' $(BUILD)/$*.o $(BUILD)/$*.cuda.o
$(BUILD)/libparticle.cuda.a: $(PARTICLE_OBJECTS:.o=.cuda.o)
@mkdir -p $(dir $@)
# build a static library with the device code
ar crs $@ $^
$(BUILD)/libparticle.so: $(PARTICLE_OBJECTS)
@mkdir -p $(dir $@)
# build a shared library with the host and device code
@$(eval TMPDIR:=$(shell mktemp -d))
@#$(CUDA_BASE)/bin/nvcc $(CUDA_CXXFLAGS) -Xcompiler '$(HOST_CXXFLAGS)' -dlink $(filter %.o,$^) $(LDFLAGS) -o $(TMPDIR)/$(basename $(@F)).o
for ARCH in $(CUDA_ARCH); do \
$(CUDA_BASE)/bin/nvlink --arch=$$ARCH --register-link-binaries=$(TMPDIR)/$(basename $(@F)).reg.c -m64 -cpu-arch=X86_64 $(filter %.o,$^) $(LDFLAGS) $(CUDA_DEVICE_LDFLAGS) -o $(TMPDIR)/$(basename $(@F)).$$ARCH.cubin; \
done
$(CUDA_BASE)/bin/fatbinary --bin2c-path=$(CUDA_BASE)/bin --create=$(TMPDIR)/$(basename $(@F)).fatbin -64 --cmdline=--compile-only -link $(foreach ARCH,$(CUDA_ARCH),--image=profile=$(ARCH),file=$(TMPDIR)/$(basename $(@F)).$(ARCH).cubin) --embedded-fatbin=$(TMPDIR)/$(basename $(@F)).fatbin.c
$(HOST_CXX) -x c++ $(CXXFLAGS) $(HOST_CXXFLAGS) $(CUDA_DEFINES) -D__NV_EXTRA_INITIALIZATION= -D__NV_EXTRA_FINALIZATION= -DFATBINFILE=\"$(TMPDIR)/$(basename $(@F)).fatbin.c\" -DREGISTERLINKBINARYFILE=\"$(TMPDIR)/$(basename $(@F)).reg.c\" -c $(CUDA_BASE)/bin/crt/link.stub -o $(TMPDIR)/$(basename $(@F)).o
$(HOST_CXX) -shared $(HOST_CXXFLAGS) $^ $(TMPDIR)/$(basename $(@F)).o $(LDFLAGS) -o $@
@rm -rf $(TMPDIR)
$(BUILD)/libpropagate.cuda.a: $(PROPAGATE_OBJECTS:.o=.cuda.o)
@mkdir -p $(dir $@)
# build a static library with the device code
ar crs $@ $^
$(BUILD)/libpropagate.so: $(PROPAGATE_OBJECTS) $(BUILD)/libparticle.cuda.a $(BUILD)/libparticle.so
@mkdir -p $(dir $@)
# build a shared library with the host and device code
@$(eval TMPDIR:=$(shell mktemp -d))
@#$(CUDA_BASE)/bin/nvcc $(CUDA_CXXFLAGS) -Xcompiler '$(HOST_CXXFLAGS)' -dlink $(filter %.o,$^) $(LDFLAGS) -lparticle.cuda -o $(TMPDIR)/$(basename $(@F)).o
for ARCH in $(CUDA_ARCH); do \
$(CUDA_BASE)/bin/nvlink --arch=$$ARCH --register-link-binaries=$(TMPDIR)/$(basename $(@F)).reg.c -m64 -cpu-arch=X86_64 $(filter %.o,$^) $(LDFLAGS) -lparticle.cuda $(CUDA_DEVICE_LDFLAGS) -o $(TMPDIR)/$(basename $(@F)).$$ARCH.cubin; \
done
$(CUDA_BASE)/bin/fatbinary --bin2c-path=$(CUDA_BASE)/bin --create=$(TMPDIR)/$(basename $(@F)).fatbin -64 --cmdline=--compile-only -link $(foreach ARCH,$(CUDA_ARCH),--image=profile=$(ARCH),file=$(TMPDIR)/$(basename $(@F)).$(ARCH).cubin) --embedded-fatbin=$(TMPDIR)/$(basename $(@F)).fatbin.c
$(HOST_CXX) -x c++ $(CXXFLAGS) $(HOST_CXXFLAGS) $(CUDA_DEFINES) -D__NV_EXTRA_INITIALIZATION= -D__NV_EXTRA_FINALIZATION= -DFATBINFILE=\"$(TMPDIR)/$(basename $(@F)).fatbin.c\" -DREGISTERLINKBINARYFILE=\"$(TMPDIR)/$(basename $(@F)).reg.c\" -c $(CUDA_BASE)/bin/crt/link.stub -o $(TMPDIR)/$(basename $(@F)).o
$(HOST_CXX) -shared $(HOST_CXXFLAGS) $^ $(TMPDIR)/$(basename $(@F)).o $(LDFLAGS) -lparticle -o $@
@rm -rf $(TMPDIR)
# link the main executable
$(BUILD)/app: $(BUILD)/main.o $(BUILD)/libparticle.so $(BUILD)/libpropagate.so
@mkdir -p $(dir $@)
# call the host linker to build the executable
$(HOST_CXX) $(BUILD)/main.o $(LDFLAGS) -lpropagate -lparticle $(CUDA_LDFLAGS) -o $@