-
Notifications
You must be signed in to change notification settings - Fork 1
/
settings.template.py
143 lines (120 loc) · 3.57 KB
/
settings.template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#coding=utf-8
"""This file contains common settings for all scripts
"""
from modules import storage
from modules.download import FileType
# File types that should be downloaded and stored.
allowedFiletypes = [
    FileType.PDF,
    FileType.DOC,
    FileType.DOCX,
    FileType.RTF,
    FileType.HTML,
]
"""What filetypes should we download and store?"""
# How this project identifies itself: project name followed by its URL.
# NOTE(review): presumably sent as the HTTP User-Agent -- confirm in the
# download code.
user_agent = "Protokollen (http://protokollen.net)"
"""How do we identify ourselves?"""
# Name of the Selenium browser used to surf the web; "chrome" is the
# alternative mentioned in this file.
browser = "firefox"
"""
Selenium browser to surf the web with.
To use Chrome instead, try:
browser = "chrome"
and add the chromedriver executable to the bin directory. Get it from
http://chromedriver.storage.googleapis.com/index.html
"""
# Storage backend to use; the attribute is assigned as-is, not called here.
# Alternatives named in this file: storage.DropboxStorage, storage.S3Storage.
Storage = storage.LocalStorage
"""
You need to set the credentials below to use an external storage
To use Dropbox, try
Storage = storage.DropboxStorage
To use Amazon S3, try
Storage = storage.S3Storage
"""
# Database backend used for indexing; None disables database indexing.
Database = None
"""
You need to set the credentials below to use a database.
Remove or set to None to disable database indexing
To use ElasticSearch, try
from modules.databases.elasticsearchdb import ElasticSearch
Database = ElasticSearch
"""
#access_key_id = None
#secret_access_key = None
#access_token = None
#bucket_name = 'protokollen'
#text_bucket_name = 'protokollen-text'
"""
Storage settings, for e.g. Amazon S3, Dropbox, etc.
Use `Storage` above, to choose your preferred method.
"""
#db_server = 'localhost'
#db_port = None
#db_harvest_table = 'files'
#db_extactor_table = 'documents'
"""
Database credentials.
Use `Database` above, to choose your preferred db.
"""
#google_client_email = None
#google_p12_file = None
#google_spreadsheet_key = None
"""
Google API credentials, if you want to use Google Docs
to define harvesting rules.
"""
#error_api_token = "your_secret_token"
#error_api_url = "http://your_error_api"
"""
A token needed to make requests to our own Error API.
"""
#tagger_api_url = "http://tagger_api_url"
#tagger_api_key = "12345678"
"""
Auth to Textual Relations text tagger API
"""
# Rules used to recognise document types, as a list of (name, rule) tuples.
# A rule is one of:
#   ("and", [rule, ...]), ("or", [rule, ...]), ("not", rule),
#   ("header_contains", "text")
document_rules = [
    (
        "kommunstyrelseprotokoll",
        ("and", [
            ("or", [
                # Truncated on purpose: OCR often confuses "l" with "i".
                ("header_contains", "protoko"),
                ("header_contains", "sammanträde"),
            ]),
            ("or", [
                ("header_contains", "kommunstyrelse"),
                # Gotland
                ("header_contains", "regionstyrelse"),
            ]),
            # Do not include KSAU.
            ("not", ("header_contains", "arbetsutskott")),
        ]),
    ),
    (
        "kallelse",
        ("and", [
            ("header_contains", "kallelse"),
            ("not", ("header_contains", "protoko")),
        ]),
    ),
]
"""
What defines a document? Extractor will look for these words in page headers,
to determine which pages from a file belong to the same document,
in case one file contains many documents. Strings are case insensitive.
Set this to None, or remove it, if each file should be considered
one document.
Syntax
------
A list of tuples (name, rules), where `rules` are nested lists and tuples:
"and": [], "or": [], "not": (), "header_contains": ""
"""
# Per-document-type options, keyed by the document type name.
document_type_settings = {
    "kommunstyrelseprotokoll": {
        "disallow_infixes": True,
    },
}
"""
Various document type specific settings:
disallow_infixes:
If set to True, “holes” in a document are not considered separate documents.
A file where the pages are identified as `ABAAACC` type pages, will be
stored as [A,C] if disallow_infixes is True for A, otherwise as [A,B,A,C]
default: False
"""