forked from broadinstitute/gdctools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
242 lines (211 loc) · 9.18 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
include ../Makefile.inc
# ---------------------------- Macro definitions ---------------------------
# TEST_ROOT and TEST_CONFIG_FILE are expected to be defined in Makefile.inc

# Location of the GDCtools sources, relative to this tests directory
SRC = ../gdctools

# Wrap the interpreter so every invocation runs with pipefail enabled and with
# the parent directory on PYTHONPATH (must be := because it refers to itself)
PYTHON := set -o pipefail && env PYTHONPATH=.. $(PYTHON)

# Locale-independent sort, so listings compare stably against baselines
SORT = env LC_COLLATE=C sort

# Uncomment to add -V to the arguments passed through CONFIG
#VERBOSITY=-V
CONFIG = --config $(TEST_CONFIG_FILE) $(VERBOSITY)
PROJECT_SUBSET = --projects TCGA-ACC TCGA-SKCM TCGA-ESCA TCGA-BLCA

# To improve readability of the target definitions below, we define commands
# that encapsulate the mirroring / dicing / file finding / loadfile parsing
# operations.
#
# NOTE(review): FIND_MIRROR_FILES joins its -name tests with -o and has no
# grouping parentheses, so when test_mirror appends "-exec ..." that action
# appears to bind only to the last ('*.txt') test.  Confirm this is intended
# before changing it: the stored MD5 baselines may depend on it.
PERFORM_MIRROR = $(PYTHON) $(SRC)/gdc_mirror.py $(CONFIG)
FIND_MIRROR_FILES = cd $(TEST_ROOT)/mirror/TCGA && \
    find TCGA-* -name '*.gz' -o -name '*.xml' -o -name '*.txt'

PERFORM_DICE = $(PYTHON) $(SRC)/gdc_dice.py $(CONFIG) $(PROJECT_SUBSET)
FIND_DICE_FILES = cd $(TEST_ROOT)/dice/TCGA && find TCGA-* -name '*.txt'

FIND_LEGACY_FILES = cd legacy/mirror/TCGA && find TCGA-* -name '*.*' -a ! -name '*.json'

PERFORM_REPORT = $(PYTHON) $(SRC)/gdc_report.py $(CONFIG) $(PROJECT_SUBSET)
PERFORM_LOADF = $(PYTHON) $(SRC)/gdc_loadfile.py $(LOADFILE_FORMAT) $(CONFIG) $(PROJECT_SUBSET)
# Select loadfile settings according to LOADFILE_DESTINATION (normally given
# on the command line by the test_loadfiles target).
ifeq ($(LOADFILE_DESTINATION),google)
  # Google-bucketized loadfiles are generated with an additional config file
  CONFIG := --config $(TEST_CONFIG_FILE) ../gdctools/config/google.cfg $(VERBOSITY)
  LOADFILE_SUBDIR = google/
  LOADFILE_MD5 = load-md5sums-google.txt
  # The expected path prefix is the last field of the FILE_PREFIX: line in google.cfg
  FILE_PREFIX = $(shell grep FILE_PREFIX: ../gdctools/config/google.cfg | awk '{print $$NF}')
  # This mode also generates FireCloud-style loadfiles, including
  # cases/participants, so that loadfile is compared against its baseline too
  CHECK_CASES_LOADFILE = diff -b baselines/TCGA-ACCSKCM.Participant.loadfile.txt $(LOADFILES_DIR)/.
else
  # Any other destination: firehose-format loadfiles whose data file paths
  # live under the local test root
  LOADFILE_SUBDIR =
  LOADFILE_MD5 = load-md5sums.txt
  FILE_PREFIX = $(TEST_ROOT)
  LOADFILE_FORMAT = --format=firehose
endif
# Directory holding the most recently generated loadfiles for the current mode
LOADFILES_DIR=$(TEST_ROOT)/loadfiles/$(LOADFILE_SUBDIR)TCGA/latest
# Note: Participant loadfiles are excluded b/c they are NOT generated for legacy/firehose format
FIND_LOAD_FILES=cd $(LOADFILES_DIR) && find . -name 'TCGA-*loadfile.txt' ! -name '*Participant*'
# Number of bytes in FILE_PREFIX.  The prefix is passed as an argument to a
# fixed '%s' format, so a '%' or backslash in it is never interpreted.
FILE_PREFIX_LENGTH=$(shell printf '%s' '$(FILE_PREFIX)' | wc -c | awk '{print $$NF}')
# The goal of VALIDATE_FILE_PREFIX is to ensure that data files listed within
# the loadfile have correct paths, either to local filesystem or cloud buckets.
# It takes field 6 of line 2 of the TCGA-ACC sample loadfile, keeps as many
# leading characters as FILE_PREFIX is long, and fails unless they equal
# FILE_PREFIX.  Both operands of the comparison are quoted: unquoted, an empty
# or whitespace-containing path makes '[' error out, which the 'if' treats as
# false, so the check would silently pass.
VALIDATE_FILE_PREFIX=cd $(LOADFILES_DIR) && \
    FILE_PATH=`sed -n '2,2p' TCGA-ACC.Sample.loadfile.txt | cut -f6` && \
    ACTUAL_PREFIX=`echo "$$FILE_PATH" | cut -c-$(FILE_PREFIX_LENGTH)` && \
    if [ "$$ACTUAL_PREFIX" != "$(FILE_PREFIX)" ] ; then \
        echo "File paths in loadfiles must begin with $(FILE_PREFIX)" ; \
        false ; \
    fi
# Shell fragment appended after "<command> || Result=$$?" in the same recipe
# line.  It succeeds only if Result holds a non-zero exit code, i.e. only if
# the preceding command failed as the test expects; otherwise it reports the
# problem and fails the recipe.
ENSURE_FAILURE_EXIT_CODE = \
    if ! (($$Result)) ; then \
        echo "Fail: returned 0 exit code , but should've returned non-zero" ; \
        false ; \
    else \
        echo "Pass: aborted with exit code $$Result" ; \
    fi
# --------------------------- Target definitions ---------------------------
# Print a short usage summary listing the main targets
help:
	@echo ; \
	echo "Run various GDCtools tests. Requires GNUmake 3.81 or later" ; \
	echo ; \
	echo "Targets:" ; \
	echo ; \
	echo "1. test Exercise tests for this package" ; \
	echo "2. install Install locally, using pip" ; \
	echo "3. uninstall Remove local install, using pip" ; \
	echo "4. publish Submit to PyPI" ; \
	echo
# Full test suite: the smoke tests followed by the dice, loadfile, legacy and
# report tests, with a success banner printed last
test: setup test_smoke test_dice test_loadfiles test_legacy test_report \
      echo_success

# Smoke tests: tool invocation, mirroring, bad-input handling, and the
# case-level and replicate-selection checks
test_smoke: setup echo_ver test_invoke test_mirror test_redo_mirror \
            test_badcfg test_cases test_choose

# Ensure the directory that receives test output exists
setup:
	mkdir -p $(TEST_ROOT)
# Run several tools with stdout discarded (so only stderr output would be
# visible), then run misctests.py and compare its output with the baseline.
test_invoke: setup
	@echo
	@echo Test runnability: invoke some tools to show nothing thrown to stderr
	@$(PYTHON) $(SRC)/GDCcore.py > /dev/null
	@$(PYTHON) $(SRC)/GDCtool.py > /dev/null
	@$(PYTHON) $(SRC)/gdc_mirror.py --help > /dev/null
	@$(PYTHON) $(SRC)/gdc_list.py --help > /dev/null
	@echo Test assorted features or API calls directly, instead of via tools
	@$(PYTHON) misctests.py > $(TEST_ROOT)/misctests.txt
	diff $(TEST_ROOT)/misctests.txt baselines/misctests.txt
# Run the mirror, keeping its complete output in test_mirror.log, then compare
# both the names and the MD5 checksums of the mirrored files with the stored
# baselines.  The log is named after the target ($@.log) so that the "clean"
# target's test_*.log pattern removes it.
test_mirror:
	@echo
	@echo "Test mirror: download small set of data, compare to baselines"
	@$(PERFORM_MIRROR) 2>&1 | tee $@.log | grep -E "GDC|Mirroring data|Mirroring start"
	$(ABORT_ON_ERROR) $@.log
	@# Now see that the named set of mirrored files matches what we expect
	$(FIND_MIRROR_FILES) | $(SORT) > $(TEST_ROOT)/mirror-files.txt
	diff -b baselines/mirror-files.txt $(TEST_ROOT)/.
	@# Verify integrity (but using our stored MD5s, not those just downloaded)
	$(FIND_MIRROR_FILES) -exec $(MD5) {} \; | $(SORT) > $(TEST_ROOT)/mirror-md5sums.txt
	diff -b baselines/mirror-md5sums.txt $(TEST_ROOT)/.
# Mirror a second time and inspect the lines of output that mention "new".
# The recipe succeeds when the mirror itself succeeds and at least one such
# line does NOT match " <number not starting with 0> new ".
test_redo_mirror:
	@echo
	@echo Test retry of mirror: nothing should be re-downloaded
	@$(PERFORM_MIRROR) 2>&1 | grep -w new | grep -vl " [^0][0-9]* new " > /dev/null
# Feed deliberately bad input to several tools and require each of them to
# exit with a non-zero status (checked by ENSURE_FAILURE_EXIT_CODE).
#
# Where the tool's output is piped through grep, the status recorded is the
# tool's own, taken from PIPESTATUS[0].  Using $$? there would pick up grep's
# exit code 1 whenever the tool exited 0 without printing "error", and the
# test would wrongly report a pass.
test_badcfg:
	@echo
	@echo Test that attempting to use bad config file quickly aborts with error
	@touch bad.cfg
	@$(PYTHON) $(SRC)/gdc_mirror.py --config bad.cfg || Result=$$? ; \
		$(ENSURE_FAILURE_EXIT_CODE)
	@echo
	@echo Now test that given bad program/project names also quickly aborts
	@$(PYTHON) $(SRC)/gdc_mirror.py --programs DUMMY || Result=$$? ; \
		$(ENSURE_FAILURE_EXIT_CODE)
	@echo
	@$(PYTHON) $(SRC)/gdc_mirror.py --projects DUMMY || Result=$$? ; \
		$(ENSURE_FAILURE_EXIT_CODE)
	@echo
	@$(PERFORM_DICE) --programs DUMMY 2>&1 | grep -i error || Result=$${PIPESTATUS[0]} ; \
		$(ENSURE_FAILURE_EXIT_CODE)
	@echo
	@$(PERFORM_LOADF) --projects DUMMY 2>&1 | grep -i error || Result=$${PIPESTATUS[0]} ; \
		$(ENSURE_FAILURE_EXIT_CODE)
	@echo
	@echo Induce failure in gdc_list, by giving bad input
	@$(PYTHON) $(SRC)/gdc_list.py DUMMY 2>&1 || Result=$$? ; \
		$(ENSURE_FAILURE_EXIT_CODE)
# Run testchoose.py, which exercises the replicate filter
test_choose:
	@echo
	@echo Test that replicate filter is choosing the appropriate aliquots
	@$(PYTHON) testchoose.py
# Dice the PROJECT_SUBSET cohorts, keeping the complete output in
# test_dice.log, then compare both the names and the MD5 checksums of the
# diced files with the stored baselines.  The log is named after the target
# ($@.log) so that the "clean" target's test_*.log pattern removes it.
test_dice:
	@echo
	@echo Test dice: on subset of cohorts, to show CLI args override config file
	$(PERFORM_DICE) 2>&1 | tee $@.log | grep -E "Dicing TCGA|date"
	$(ABORT_ON_ERROR) $@.log
	$(FIND_DICE_FILES) | $(SORT) > $(TEST_ROOT)/dice-files.txt
	diff -b baselines/dice-files.txt $(TEST_ROOT)/.
	@# Verify integrity (but using our stored MD5s, not those just downloaded)
	$(FIND_DICE_FILES) -exec $(MD5) {} \; | $(SORT) > $(TEST_ROOT)/dice-md5sums.txt
	diff -b baselines/dice-md5sums.txt $(TEST_ROOT)/.
# By default GDCtools generates firecloud-style loadfiles, so the first pass
# (no LOADFILE_DESTINATION given) exercises the other, firehose, format; the
# second pass then generates the Google-bucketized loadfiles.
test_loadfiles:
	@$(EMAKE) test_loadfile
	@echo
	@$(EMAKE) test_loadfile LOADFILE_DESTINATION=google
# Generate loadfiles for the current LOADFILE_DESTINATION, keeping the
# complete output in test_loadfile.log, then validate the result against the
# stored baselines.  The log is named after the target ($@.log) so that the
# "clean" target's test_*.log pattern removes it.
test_loadfile:
	@echo
	@echo "Test $(LOADFILE_DESTINATION) loadfile generation ..."
	$(PERFORM_LOADF) 2>&1 | tee $@.log | grep -E "Generating|date"
	$(ABORT_ON_ERROR) $@.log
# First, ensure that data files are prefixed with appropriate path
	$(VALIDATE_FILE_PREFIX)
# Now compare names of files
	$(FIND_LOAD_FILES) | $(SORT) > $(TEST_ROOT)/load-files.txt
	diff -b baselines/load-files.txt $(TEST_ROOT)/.
# Now ensure that replicate etc filter worked
	diff -b baselines/TCGA.filtered_samples.txt $(LOADFILES_DIR)/.
# And check that case/participants loadfile is OK, if relevant
# (CHECK_CASES_LOADFILE is only defined when LOADFILE_DESTINATION is google)
	$(CHECK_CASES_LOADFILE)
# Lastly, MD5 compare content after removing local path prefix to files
	$(FIND_LOAD_FILES) -exec sed 's|$(FILE_PREFIX)/||g' {} \; \
		| $(SORT) > $(TEST_ROOT)/load-munged.txt
	$(MD5) $(TEST_ROOT)/load-munged.txt | sed 's|$(TEST_ROOT)/||g' \
		> $(TEST_ROOT)/$(LOADFILE_MD5)
	diff -b baselines/$(LOADFILE_MD5) $(TEST_ROOT)/.
# Path to Rscript, or empty when it cannot be found
Rscript=$(shell type -P Rscript 2>/dev/null)

# Generate the sample reports and compare the list of HTML files produced with
# the baseline; skipped (successfully) when Rscript is unavailable.
# The steps are chained with && so that a failure in any of them fails the
# target: joined by ';' only the exit status of the final diff was seen.
# The log is named after the target ($@.log) so that the "clean" target's
# test_*.log pattern removes it.
test_report:
	@echo
	@if [ -z "$(Rscript)" ] ; then \
		echo "R / Rscript not installed on your system, skipping test_report" ;\
	else \
		echo "Test sample report generation ..." && \
		$(PERFORM_REPORT) 2>&1 | tee $@.log | grep -E "Generating|date" && \
		$(ABORT_ON_ERROR) $@.log && \
		(cd $(TEST_ROOT)/reports/latest && find . -name '*.html' | \
			$(SORT) > $(TEST_ROOT)/report-files.txt) && \
		diff -b baselines/report-files.txt $(TEST_ROOT)/. ; \
	fi
# Mirror using onlycases.cfg, restricted to the Biospecimen category, keeping
# the complete output in test_cases.log, then compare the names of the
# mirrored (non-JSON) files with the baseline.  The log is named after the
# target ($@.log) so that the "clean" target's test_*.log pattern removes it.
test_cases:
	@echo
	@echo "Test fine-grained retrieval of 1 case (and 1 data category)"
	@$(PYTHON) $(SRC)/gdc_mirror.py --config onlycases.cfg 2>&1 \
		--categories Biospecimen | tee $@.log | \
		grep -E "GDC|Mirroring data|Mirroring start|categorie"
	$(ABORT_ON_ERROR) $@.log
	@cd onlycases/mirror/TCGA && find TCGA-* \
		-name '*.*' -a ! -name '*.json' | \
		$(SORT) > ../../onlycases-files.txt
	diff -b baselines/onlycases-files.txt onlycases/.
# Mirror using legacy.cfg, keeping the complete output in test_legacy.log,
# then compare both the names and the MD5 checksums of the downloaded
# (non-JSON) files with the stored baselines.  The log is named after the
# target ($@.log) so that the "clean" target's test_*.log pattern removes it.
test_legacy:
	@echo
	@echo "Test legacy data download (for 1 case and 3 data categories)"
	@$(PYTHON) ../gdctools/gdc_mirror.py --config legacy.cfg 2>&1 | \
		tee $@.log | \
		grep -E "GDC|Mirroring data|Mirroring start"
	$(ABORT_ON_ERROR) $@.log
	$(FIND_LEGACY_FILES) | $(SORT) > ../../legacy-files.txt
	diff -b baselines/legacy-files.txt legacy/.
	$(FIND_LEGACY_FILES) -exec $(MD5) {} \; | $(SORT) > ../../legacy-md5sums.txt
	diff -b baselines/legacy-md5sums.txt legacy/.
# Re-run the whole test suite with PYTHON_VER=3
test3:
	$(MAKE) -e PYTHON_VER=3 test

# One-liner that imports gdctools and prints its version
VERTEST = "import gdctools as g; print('Version: ' + g.GDCcore.GDCT_VERSION)"

# Test the package locally, as if it were installed
testl:
	@$(PYTHON) -c $(VERTEST)

# Test the installed package (run from /tmp rather than from this directory)
testi:
	@(cd /tmp ; $(PYTHON) -c $(VERTEST))
# Every target in this file names a command rather than a file it creates, so
# all of them are declared phony.  Otherwise a stray file or directory with
# the same name as a target would make that target look up to date and
# silently skip it.
.PHONY: help test test_smoke setup test_invoke test_mirror test_redo_mirror \
        test_badcfg test_choose test_dice test_loadfiles test_loadfile \
        test_report test_cases test_legacy test3 testl testi \
        echo_success echo_ver clean rclean

# Final prerequisite of "test": reached only when everything before it passed
echo_success:
	@echo
	@echo Success!

# Show which Python interpreter is expected to be in use
echo_ver:
	@echo Using $(PYTHON_HOME)/bin/$(PYTHON_EXE) ...

# Remove build products, editor backups, per-target logs and the bad.cfg
# created by test_badcfg
clean:
	rm -rf build dist *.egg-info *~ test_*.log bad.cfg

# "Really clean": additionally remove the test output tree and the other
# directories listed below (the backslash bypasses any shell alias for rm)
rclean: clean
	\rm -rf $(TEST_ROOT) GDCtool gdctools_tmp onlycases legacy