-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathconfig.example.json
52 lines (52 loc) · 1.37 KB
/
config.example.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
{
  "debug": true,
  "userAgent": "SSScraper",
  "htmlCache": {
    "directory": "./cache"
  },
  "pdfCache": {
    "directory": "./pdf-cache"
  },
  "request": {
    "timeoutInMs": 2000,
    "domainGlob": "**",
    "parallelism": 2,
    "delayInMs": 300,
    "randomDelayInMs": 1500
  },
  "input": {
    "startUrl": "https://gotripod.com/",
    "urlFilters": ["^https://gotripod\\.com/.*"],
    "disallowedUrlFilters": ["^https://gotripod\\.com/private/.*"]
  },
  "output": {
    "type": "jsonl",
    "filename": "documents.jsonl",
    "webhook": null
  },
  "html": {
    "selectors": {
      "id": "{{ .Request.URL }}",
      "title": "title",
      "statusCode": "{{ .Response.StatusCode }}",
      "content": "#content",
      "metaDescription": "meta[name=\"description\"]",
      "metaKeywords": "meta[name=\"keywords\"]",
      "h1": "h1",
      "h2": "h2",
      "url": "{{ .Request.URL }}",
      "section": "{{ .Request.URL.Path | regexFind \"/(insights|work)/\" | replace \"/\" \"\" }}",
      "urlPathSplit": "{{ .Request.URL.Path | splitList \"/\" | join \" \" }}"
    }
  },
  "pdf": {
    "enabled": true,
    "selectors": {
      "id": "{{ .Request.URL }}",
      "title": "{{ index .Meta \"title\" }}",
      "url": "{{ .Request.URL }}",
      "content": "{{ .TextContent }}",
      "section": "{{ .Request.URL.Path | regexFind \"/(insights|work)/\" | replace \"/\" \"\" }}"
    }
  }
}