forked from nevali/crawl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.conf
111 lines (93 loc) · 3.93 KB
/
crawl.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
[crawler]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Configuration for this specific instance of the crawler
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The crawler ID is a numeric identifier used to identify a single crawler
;; thread, and must be both unique and used contiguously.
;; If you specify id=1 and threads=8, then this instance of crawld
;; would launch eight threads, identifying themselves as crawler 1, crawler 2,
;; and so on up to crawler 8. You would then specify id=9 in the next
;; instance that you configure, and so on.
;id=1
;; Specify the number of threads in this instance
;threads=1
;; Uncomment to make crawling happen verbosely
; verbose=yes
;; Whether to fork into the background or not
; detach=yes
[cluster]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Cluster configuration
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The environment is used in logging to distinguish groups of instances
;; from one another. For dynamic clustering, it's also used to generate
;; the etcd key paths, so that members in different environments using the
;; same etcd cluster don't interfere with one another.
;environment=production
;; Dynamic clustering will use etcd if configured to do so, in which case
;; the crawler ID specified in the [crawler] section will be ignored. Both
;; a name (used to create the etcd key path) and a registry URL must be
;; specified.
;name=anansi
;registry=http://localhost:2379/
;; Static clustering is used if dynamic clustering is not configured.
;; The total number of crawl threads across the whole cluster should be
;; configured here; use the same value on every member of the cluster.
;; Note that this is distinct from 'threads' in the [crawler] section,
;; which is the number of threads in a single instance.
;threads=1
[queue]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Queue configuration
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; the name of the queue module to use
name=db
;; if using the 'db' queue module, specify a database connection URI
;; NOTE(review): this example connects as the MySQL 'root' user with no
;; password; consider a dedicated, least-privileged account instead.
uri=mysql://root@localhost/anansi
;; set to yes to enable query debugging
debug-queries=no
;; set to yes to enable error debugging
debug-errors=no
[processor]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Processor configuration
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The processor module is responsible for parsing retrieved resources
;; and adding new links to the crawl queue as a result. Some processor
;; modules may apply specific policies to the process, rejecting certain
;; resources based upon pre-defined criteria.
;; The 'rdf' processor parses resources as RDF and adds any URIs found in
;; the graphs to the crawl queue.
name=rdf
;; The 'lod' processor is an extension of the RDF processor which also
;; performs licensing checks upon the resources, rejecting any which aren't
;; explicitly licensed as open data.
; name=lod
;; To be useful, the LOD processor must be configured with licensing
;; predicates and a licence whitelist and/or blacklist; these are set in
;; the [lod:licenses] section.
[lod:licenses]
;; Licensing configuration for the 'lod' processor.
;; Each 'predicate' is the URI of an RDF predicate; presumably these are the
;; predicates inspected for a licensing statement -- TODO confirm.
;; NOTE(review): the 'predicate' key is deliberately repeated and the values
;; are quoted; this relies on the parser collecting repeated keys and
;; stripping the quotes -- confirm before reformatting.
predicate="http://purl.org/dc/terms/rights"
predicate="http://purl.org/dc/terms/license"
predicate="http://purl.org/dc/terms/accessRights"
predicate="http://creativecommons.org/ns#license"
predicate="http://www.w3.org/1999/xhtml/vocab#license"
;; Licence URI on the whitelist (presumably one treated as acceptable open
;; data -- TODO confirm).
whitelist="http://creativecommons.org/publicdomain/zero/1.0/"
[cache]
;; specify the location of the cache; the active example is a local
;; filesystem path
uri=/var/spool/anansi
;; alternative example using an s3:// URI (looks like an S3 bucket named
;; 'anansi'); username and password are presumably the credentials for the
;; given endpoint -- TODO confirm.
;; NOTE(review): if real credentials are stored here, restrict the
;; permissions on this file.
; uri=s3://anansi/
; username=user
; password=pass
; endpoint=s3.amazonaws.com
[log]
;; set stderr=1 to log output to standard error, not just syslog
;stderr=1
;; set the logging threshold level. valid values are emerg, alert, crit, err,
;; warn, notice, info, or debug.
;level=notice
;; set the logging facility. valid values include user, local0..local7,
;; although others may be valid on your system.
;; NOTE(review): this example previously read 'level=daemon', which repeated
;; the 'level' key with a value that is not a valid level; 'facility' is
;; presumably the intended key name -- confirm against the log module.
;facility=daemon
;; Policy sections
[policy:content-types]
;; Content-type policy. Values are comma-separated lists of MIME types.
;; Presumably resources whose type is blacklisted are rejected, and if a
;; whitelist is given only the listed types are accepted -- TODO confirm.
; blacklist=text/plain,application/x-unknown
; whitelist=text/html
[policy:schemes]
;; URI scheme policy. Values are comma-separated lists of scheme names.
;; Presumably URIs whose scheme is blacklisted are not crawled, and if a
;; whitelist is given only the listed schemes are crawled -- TODO confirm.
; whitelist=http
; blacklist=scp,mailto