-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile
333 lines (247 loc) · 13 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
##########################################################################################
# Makefile for impresso language identification
#
# Note: Processing is done on locally stored data, not directly on s3 storage.
##########################################################################################
# Make setup
# Run every recipe with bash (the recipes use process substitution and pipefail).
SHELL := /bin/bash
# Exported to the recipe shells so that they run with errexit and pipefail enabled.
export SHELLOPTS := errexit:pipefail
# Treat all targets as secondary: intermediate files are never deleted automatically.
.SECONDARY:
# Targets that name actions, not files (each one listed once).
.PHONY: impresso-lid \
  impresso-lid-stage1a-target \
  impresso-lid-stage1b-target \
  impresso-lid-stage2-target \
  impresso-lid-upload-release-to-s3 \
  impresso-lid-stage2-diagnostics-files-manifest-target \
  impresso-lid-statistics \
  impresso-lid-eval \
  update-requirements
# Defines local variables if file exists
# See README.md for details
-include Makefile.local.mk
# Help target for displaying usage information
# Every recipe line must start with a TAB; the leading @ keeps make from echoing the command.
help:
	@echo "Usage: make <target>"
	@echo "Targets:"
	@echo " impresso-lid # Run the full impresso Language Identification (LID) pipeline."
	@echo " impresso-lid-stage1a-target # Process initial language identification for each content item."
	@echo " impresso-lid-stage1b-target # Collect and summarize statistics from Stage 1a."
	@echo " impresso-lid-stage2-target # Finalize language decisions and generate diagnostics."
	@echo " impresso-lid-upload-release-to-s3 # Upload the processed data to an AWS S3 bucket."
	@echo " impresso-lid-statistics # Generate statistics from the processed data."
	@echo " impresso-lid-eval # Evaluate the LID results against a gold standard."
	@echo " update-requirements # Update the Python dependencies file."
	@echo " help # Show this help message"

# Default goal set to the help target
.DEFAULT_GOAL := help

# Add the help target to the list of phony targets to ensure it runs as needed without prerequisites
# NOTE(review): PHONY_TARGETS is not defined in this file; it expands to nothing unless
# Makefile.local.mk provides it.
.PHONY: help $(PHONY_TARGETS)
# generally export all variables to sub-make calls (needed in this Makefile)
# The targets of stage 1a need the targets of stage 1b to exist
#export
# Note: use this target only on a single build machine
# If you run the commands on several machines on the same collection each stage has to be finished on all machines
# before moving to the next stage
#: Run full impresso LID pipeline build
# The three stages are built by recursive make calls, one after the other, so that a stage
# is complete before the next one starts. $(MAKE) is used so that flags such as -n and -j
# are handed on to the sub-makes.
# NOTE(review): MAKEFILEFLAG is not defined in this file; it is empty unless set by the user.
impresso-lid:
	# INFO: Recursively making impresso-lid-stage1a-target
	$(MAKE) $(MAKEFILEFLAG) -f $(firstword $(MAKEFILE_LIST)) impresso-lid-stage1a-target
	# INFO: Recursively making impresso-lid-stage1b-target
	$(MAKE) $(MAKEFILEFLAG) -f $(firstword $(MAKEFILE_LIST)) impresso-lid-stage1b-target
	# INFO: Recursively making impresso-lid-stage2-target
	$(MAKE) $(MAKEFILEFLAG) -f $(firstword $(MAKEFILE_LIST)) impresso-lid-stage2-target
include lib/debug.mk
# emit additional diagnostics while building
DEBUG ?= 0

# TARGET_LOG_MACRO is appended to a recipe command and sends its stderr to the per-target
# log file $@.log. It has to stay recursively expanded (=) so that $@ is resolved inside
# the recipe that uses it and every target gets its own log file.
# DEBUG_OPTION is the extra verbosity option handed to the Python scripts.
ifeq ($(DEBUG),1)
# debug mode: additionally print diagnostic output on the terminal
TARGET_LOG_MACRO = 2> >(tee $@.log 1>&2)
# if you want to tee and redirect stdout AND stderr, you need to write 1>&2 AFTER the macro in a rule.
# Remember that the order of redirections matters! https://stackoverflow.com/q/17975232
DEBUG_OPTION = --verbose 4
else
TARGET_LOG_MACRO = 2> $@.log
DEBUG_OPTION =
endif
##########################################################################################
# Make variables for impresso data infrastructure
# Variables in uppercase and underscores can be overwritten by the user at build time

# make sure that this directory points to a local copy of the impresso s3 data containers
# only read access is needed
IMPRESSO_REBUILT_DATA_DIR ?= rebuilt-data

# build dir
BUILD_DIR ?= build

# Language identification version
LID_VERSION ?= v1.4.4

# write access is needed here
LID_BUILD_DIR ?= $(BUILD_DIR)/$(LID_VERSION)

# all LID systems to use
LID_SYSTEMS ?= langid langdetect impresso_ft wp_ft

# fastText models
# The rules read the historically misspelled IMPPRESSO_FASTTEXT_MODEL. The correctly
# spelled IMPRESSO_FASTTEXT_MODEL is accepted too and serves as its default, so either
# name can be used to override the model path.
IMPRESSO_FASTTEXT_MODEL ?= models/fasttext/impresso-lid.bin
IMPPRESSO_FASTTEXT_MODEL ?= $(IMPRESSO_FASTTEXT_MODEL)
WIKIPEDIA_FASTTEXT_MODEL ?= models/fasttext/lid.176.bin

# minimal text length threshold for automatic LID in stage 1 and 2
STAGE1A_MINIMAL_TEXT_LENGTH ?= 40
STAGE1B_MINIMAL_TEXT_LENGTH ?= 200
STAGE2_MINIMAL_TEXT_LENGTH ?= 50

# hyperparameters for scoring the languages
BOOST_FACTOR ?= 1.5
WEIGHT_LB_IMPRESSO ?= 6
# passed as --minimal-voting-score to the stage 2 script
MINIMAL_VOTING_SCORE ?= 0.5
STAGE1_MINIMAL_LID_PROBABILITY ?= 0.20
STAGE2_MINIMAL_LID_PROBABILITY ?= 0.5
# passed as --minimal-vote-score to the stage 1b statistics script
MINIMAL_VOTE_SCORE ?= 1.5

# evaluation mode
EVALUATION_OUTPUT_FORMAT ?= json

# S3 bucket path (without "/" suffix)
S3_BUCKET_LANGIDENT_PATH ?= /42-processed-data-final/langident
# Name of the stage 2 output directory below $(LID_BUILD_DIR).
# With EVAL_STAGE2=1 the stage 2 hyperparameters are encoded in the directory name so that
# runs with different settings do not overwrite each other. The -mlp part uses
# STAGE2_MINIMAL_LID_PROBABILITY, the value that is actually handed to the stage 2 script.
stage2-dir := stage2
ifeq ($(EVAL_STAGE2),1)
stage2-dir := stage2-mvs$(MINIMAL_VOTING_SCORE)-mlp$(STAGE2_MINIMAL_LID_PROBABILITY)-wli$(WEIGHT_LB_IMPRESSO)
endif
# all known collection acronyms from the file system
# but leave out anything with a hyphen in it, e.g. rebuilt_v1-1-0.json
# Only the first % of a filter pattern is a wildcard, so a pattern like %-% cannot express
# "contains a hyphen"; each entry is therefore tested with findstring instead.
COLLECTION_ACRONYMS ?= $(strip \
  $(foreach entry,$(notdir $(wildcard $(IMPRESSO_REBUILT_DATA_DIR)/*)),\
    $(if $(findstring -,$(entry)),,$(entry))))

# emit content of make variable if $(DEBUG) is set to 1
$(eval $(call debug_variable,COLLECTION_ACRONYMS))

# get path of all impresso rebuilt files
impresso-rebuilt-files := \
  $(wildcard \
    $(foreach ca,$(COLLECTION_ACRONYMS),\
      $(IMPRESSO_REBUILT_DATA_DIR)/$(ca)/*.jsonl.bz2\
    )\
  )
########################################################################################################################
# stage 1a: apply lid classification to all content items

# one stage 1 output file per rebuilt input file, mirrored below $(LID_BUILD_DIR)/stage1
impresso-lid-stage1a-files := $(subst $(IMPRESSO_REBUILT_DATA_DIR),$(LID_BUILD_DIR)/stage1,$(impresso-rebuilt-files))

$(eval $(call debug_variable,impresso-lid-stage1a-files))

#: Generate all stage 1a files
impresso-lid-stage1a-target: $(impresso-lid-stage1a-files)

# Build one stage 1 file from the rebuilt file with the same relative path.
# Marker files next to the target coordinate several build machines:
#   $@.running  holds the host name of the machine that is currently building $@
#   $@.done     marks a finished build
# If one of the markers exists the recipe stops successfully without doing anything.
# The output is first written to a host-specific working file and only renamed to $@ after
# the script has succeeded. The trap removes the .running marker when the shell exits or
# is interrupted.
$(LID_BUILD_DIR)/stage1/%.jsonl.bz2: $(IMPRESSO_REBUILT_DATA_DIR)/%.jsonl.bz2
	mkdir -p $(@D) \
	&& if test -e $@.running || test -e $@.done ; \
	    then { echo "Already building/built $@ " && exit 0 ; } ; \
	    else { echo "$${HOSTNAME}" > $@.running ; echo "$$(date -Iseconds) Building $@ now..." ; } ; \
	fi \
	&& trap 'rm -fv $@.running ' EXIT HUP TERM SIGINT \
	&& python lib/language_identification.py \
	  --lids $(LID_SYSTEMS) \
	  --impresso-ft $(IMPPRESSO_FASTTEXT_MODEL) \
	  --wp-ft $(WIKIPEDIA_FASTTEXT_MODEL) \
	  --minimal-text-length $(STAGE1A_MINIMAL_TEXT_LENGTH) \
	  --round-ndigits 3 \
	  --git-describe $$(git describe) \
	  --infile $< \
	  --outfile $@.$${HOSTNAME}.working.jsonl.bz2 \
	  $(DEBUG_OPTION) \
	  $(TARGET_LOG_MACRO) 1>&2 \
	&& mv $@.$${HOSTNAME}.working.jsonl.bz2 $@ \
	&& mv $@.running $@.done \
	&& echo "$$(date -Iseconds) build of $@ finished successfully."

# &> >(tee $@.log >&2)
# Note: we use the idiom &> >(tee $@.log >&2) because the LID systems output log differently
# https://stackoverflow.com/questions/692000/how-do-i-write-stderr-to-a-file-while-using-tee-with-a-pipe
########################################################################################################################
# Stage 1b second part: Collect lid statistics per collection
# As stage 1a can be run in parallel on multiple machines, we have to make sure that all
# stage 1a files of a collection are finished before actually doing stage 1b

# collect statistics on stage 1a results per newspaper
impresso-lid-stage1b-files := \
  $(addprefix $(LID_BUILD_DIR)/stage1/,\
    $(foreach ca,$(COLLECTION_ACRONYMS),$(ca).stats.json))

$(eval $(call debug_variable,impresso-lid-stage1b-files))

# a .done stamp file for each collection to indicate completion for the next stage
impresso-lid-stage1a-done-files := $(foreach ca,$(COLLECTION_ACRONYMS),$(LID_BUILD_DIR)/stage1/$(ca).done)

# template for specifying per collection
# Expects the loop variable ca to hold the collection acronym. The stamp depends on all
# stage 1a files below $(LID_BUILD_DIR)/stage1/$(ca)/; the filter pattern needs the
# trailing % because a pattern without % only matches a word that is exactly equal to it.
define stage1a_done_rule_template
$(LID_BUILD_DIR)/stage1/$(ca).done: $(filter $(LID_BUILD_DIR)/stage1/$(ca)/%,$(impresso-lid-stage1a-files))
	touch $$@
endef

# Evaluate the template once per collection. Calling eval inside the loop keeps the
# generated rules apart; evaluating the joined foreach result would glue each rule header
# onto the recipe line of the previous rule.
$(foreach ca,$(COLLECTION_ACRONYMS),$(eval $(stage1a_done_rule_template)))
# Compute the statistics of one collection from its stage 1a files.
# The stem ($*) is the collection acronym. The input files are selected by the shell glob
# <stage1 dir>/<collection>/<collection>*.jsonl.bz2.
# The log redirection belongs to the python command, so it is placed before the pipe;
# sponge collects the complete output before writing $@. If anything fails the target is
# removed and the recipe exits with an error.
$(LID_BUILD_DIR)/stage1/%.stats.json: $(LID_BUILD_DIR)/stage1/%.done
	python lib/collection_statistics.py \
	  --collection $* \
	  --lids $(LID_SYSTEMS) \
	  --boosted-lids orig_lg impresso_ft \
	  --minimal-text-length $(STAGE1B_MINIMAL_TEXT_LENGTH) \
	  --boost-factor $(BOOST_FACTOR) \
	  --minimal-vote-score $(MINIMAL_VOTE_SCORE) \
	  --minimal-lid-probability $(STAGE1_MINIMAL_LID_PROBABILITY) \
	  --git-describe $$(git describe) \
	  $(DEBUG_OPTION) \
	  $(<:.done=)/$(*)*.jsonl.bz2 \
	  $(TARGET_LOG_MACRO) \
	| sponge $@ \
	&& echo "$$(date -Iseconds) build of $@ finished successfully." \
	|| { echo "Warning: Something went wrong while building $@. Check $@.log. Cleaning up $@ now." ; rm -vf $@ ; exit 1 ; }
# Concatenate all newspaper stats in one file
# The .done stamps are prerequisites only; just the *.stats.json files are concatenated.
# /dev/null is passed as first file so that cat never waits on stdin when there are no
# stats files at all.
$(LID_BUILD_DIR)/stage1.stats.json: $(impresso-lid-stage1a-done-files) $(impresso-lid-stage1b-files)
	cat /dev/null $(filter %.stats.json,$+) > $@
#: Generate all stage 1b files
# Prerequisites in order: the stage 1a target, the per-collection statistics files and the
# concatenated statistics file.
impresso-lid-stage1b-target: impresso-lid-stage1a-target $(impresso-lid-stage1b-files) $(LID_BUILD_DIR)/stage1.stats.json
########################################################################################################################
# Stage 2 second part: Decide for a language given collection statistics and individual content item predictions

# one stage 2 output file per rebuilt input file, mirrored below $(LID_BUILD_DIR)/$(stage2-dir)
impresso-lid-stage2-files := \
  $(subst $(IMPRESSO_REBUILT_DATA_DIR),$(LID_BUILD_DIR)/$(stage2-dir),$(impresso-rebuilt-files))

$(eval $(call debug_variable,impresso-lid-stage2-files))

# the diagnostics file that accompanies each stage 2 output file
impresso-lid-stage2-diagnostics-files := \
  $(patsubst %.jsonl.bz2,%.diagnostics.json,$(impresso-lid-stage2-files))

#: Generate all stage 2 files
impresso-lid-stage2-target: impresso-lid-stage1b-target $(impresso-lid-stage2-files) $(impresso-lid-stage2-diagnostics-files)
# rule for building all stage 2 files
# One invocation of the script writes both targets of this pattern rule (the language
# decisions and the diagnostics file). All paths are derived from the stem ($*, i.e.
# <collection>/<file name without extension>) and not from $@, because $@ may be either of
# the two targets depending on which one triggered the rule.
# The collection statistics file is $(LID_BUILD_DIR)/stage1/<collection>.stats.json.
# The output is first written to a working file and renamed after the script succeeded.
$(LID_BUILD_DIR)/$(stage2-dir)/%.jsonl.bz2 $(LID_BUILD_DIR)/$(stage2-dir)/%.diagnostics.json: $(LID_BUILD_DIR)/stage1/%.jsonl.bz2
	mkdir -p $(@D) \
	&& python lib/impresso_lid.py \
	  --lids $(LID_SYSTEMS) \
	  --weight-lb-impresso-ft $(WEIGHT_LB_IMPRESSO) \
	  --minimal-lid-probability $(STAGE2_MINIMAL_LID_PROBABILITY) \
	  --minimal-voting-score $(MINIMAL_VOTING_SCORE) \
	  --minimal-text-length $(STAGE2_MINIMAL_TEXT_LENGTH) \
	  --collection-stats-filename $(LID_BUILD_DIR)/stage1/$(patsubst %/,%,$(dir $*)).stats.json \
	  --git-describe $$(git describe) \
	  --validate \
	  --diagnostics-json $(LID_BUILD_DIR)/$(stage2-dir)/$*.diagnostics.json \
	  --infile $< \
	  --outfile $(LID_BUILD_DIR)/$(stage2-dir)/$*.jsonl.bz2.working.jsonl.bz2 \
	  $(DEBUG_OPTION) \
	  $(TARGET_LOG_MACRO) \
	&& mv $(LID_BUILD_DIR)/$(stage2-dir)/$*.jsonl.bz2.working.jsonl.bz2 $(LID_BUILD_DIR)/$(stage2-dir)/$*.jsonl.bz2 \
	&& echo "$$(date -Iseconds) build of $@ finished successfully." \
	|| { echo "Warning: Something went wrong while building $@. Check $@.log. Cleaning up $@ now." ; rm -vf $(LID_BUILD_DIR)/$(stage2-dir)/$*.jsonl.bz2 $(LID_BUILD_DIR)/$(stage2-dir)/$*.jsonl.bz2.working.jsonl.bz2 ; exit 1 ; }
########################################################################################################################
# Prepare official distribution for impresso with files per year

# The copy runs with --dry-run by default, i.e. nothing is transferred.
# Call make with RCLONE_DRY_RUN= (empty) to really upload.
RCLONE_DRY_RUN ?= --dry-run

#: Upload the impresso lid information to s3 impresso bucket
# After the copy, rclone check compares the local stage2 directory with the uploaded
# version directory. It uses the same source directory and the same include filter as the
# copy, so only the *.jsonl.bz2 files are compared.
impresso-lid-upload-release-to-s3: impresso-lid-stage2-target
	rclone $(RCLONE_DRY_RUN) --verbose copy $(LID_BUILD_DIR)/stage2/ s3-impresso:$(S3_BUCKET_LANGIDENT_PATH)/$(LID_VERSION) --include "*.jsonl.bz2" --ignore-existing \
	&& rclone --verbose check $(LID_BUILD_DIR)/stage2/ s3-impresso:$(S3_BUCKET_LANGIDENT_PATH)/$(LID_VERSION)/ --include "*.jsonl.bz2"
########################################################################################################################
# Produce statistics

# Write the list of all stage 2 diagnostics files into a manifest file.
impresso-lid-stage2-diagnostics-files-manifest-target: \
  $(LID_BUILD_DIR)/statistics.d \
  $(LID_BUILD_DIR)/statistics.d/impresso-lid-stage2-diagnostics-files-manifest.txt

# $(file ...) is carried out by make itself while it expands the recipe, i.e. before any
# shell command of the recipe runs. The output directory therefore has to be an
# (order-only) prerequisite; a mkdir inside the recipe would come too late.
$(LID_BUILD_DIR)/statistics.d/impresso-lid-stage2-diagnostics-files-manifest.txt: $(impresso-lid-stage2-diagnostics-files) | $(LID_BUILD_DIR)/statistics.d
	$(file > $@,$+)

# create directory
%.d:
	mkdir -p $@
#: Compute several statistics on the output of impresso LID
impresso-lid-statistics: \
  $(LID_BUILD_DIR)/statistics.d/per-collection-year-contentitems.tsv \
  $(LID_BUILD_DIR)/statistics.d/collection-year-language-data.tsv

#: Simple check whether number of content items per collection-year pair matches other impresso processing statistics
# One TSV row (key, value) per diagnostics file, taken from the first entry of its .N object.
$(LID_BUILD_DIR)/statistics.d/per-collection-year-contentitems.tsv: $(impresso-lid-stage2-diagnostics-files)
	mkdir -p $(@D) \
	&& cat $+ | jq -r '.N|to_entries[0]|[.key,.value]|@tsv' | sort | sponge $@

# One TSV row (collection, year, key, value) per entry of .lg in each diagnostics file;
# collection and year are obtained by splitting the first key of .N at "-".
# The output directory is created here as well, like in the rule above.
$(LID_BUILD_DIR)/statistics.d/collection-year-language-data.tsv: $(impresso-lid-stage2-diagnostics-files)
	mkdir -p $(@D) \
	&& cat $+ | jq -r '(.N|to_entries[0]|.key|split("-")) as [$$collection,$$year]| (.lg|to_entries|map({key,value,$$collection,$$year})|.[]|[.collection,.year,.key,.value]|sort_by(.0,.1,.2)|@tsv)' |sort | sponge $@
########################################################################################################################
# Evaluate against gold standard

#: Perform evaluation
impresso-lid-eval: $(LID_BUILD_DIR)/$(stage2-dir).eval.all.$(EVALUATION_OUTPUT_FORMAT)

# The ground truth is read from stdin and the stage 2 results from --data-dir. The
# evaluation output goes to $@ via sponge, the diagnostics to a file next to it.
# The prerequisite is a phony target, so this file is rebuilt on every request.
# On failure the (possibly empty) output file is removed, as in the other build rules.
$(LID_BUILD_DIR)/$(stage2-dir).eval.all.$(EVALUATION_OUTPUT_FORMAT): impresso-lid-stage2-target
	python lib/impresso_lid_eval.py \
	  < test/ground-truth/all.jsonl \
	  --file-extension jsonl.bz2 \
	  --data-dir $(LID_BUILD_DIR)/$(stage2-dir) \
	  --diagnostics-json $(@:$(EVALUATION_OUTPUT_FORMAT)=)diagnostics.jsonl \
	  --output-format $(EVALUATION_OUTPUT_FORMAT) \
	  $(DEBUG_OPTION) \
	  $(TARGET_LOG_MACRO) \
	| sponge $@ \
	|| { echo "Warning: Something went wrong while building $@. Check $@.log. Cleaning up $@ now." ; rm -vf $@ ; exit 1 ; }
# Regenerate requirements.txt from the pipenv environment.
# The target is an action, not a file, hence phony. The list is written to a temporary
# file first so that a failing pipenv call does not leave a truncated requirements.txt.
.PHONY: update-requirements
update-requirements:
	pipenv requirements > requirements.txt.tmp \
	&& mv -f requirements.txt.tmp requirements.txt \
	|| { rm -f requirements.txt.tmp ; exit 1 ; }