postprocess.py
"""
Gets rid of dead links indexed by Obsidian-Hugo. Definitely not the most
efficient way to do this, but hopefully it gets the job done.

The index file usually lives at "assets/indices/linkIndex.json".

Basic idea:
- read linkIndex.json
- loop through each index entry
    - loop through each "links" entry
        - loop through each node
            - convert character encoding
            - get rid of links that don't correspond to a file in "content"
    - loop through each "backlinks" entry
        - loop through each node
            - convert character encoding
            - get rid of links that don't correspond to a file in "content"
- loop through each entry in the flat "links" list
    - convert character encoding
    - get rid of links that don't correspond to a file in "content"

Also, Obsidian-Hugo seems to mistake internal block or section references for
references to the site root, so all links targeting "/" get removed as well.

Finally, fix missing orphans by adding a self-link.
"""
import json
import urllib.parse
import os
import re
INDEX_FILE = "./assets/indices/linkIndex.json"
CONTENT_FOLDER = "./content"
def decode_url_encoding(s) -> str:
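    # Percent-decode URL-encoded link names (the "convert character encoding"
    # step from the module docstring). Note: not currently called below.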
return urllib.parse.unquote(s)
def load_json(path) -> dict:
    with open(path) as f:
        return json.load(f)
def strip_name(s) -> str:
    # Gets rid of the ".md" extension, symbols, spaces, etc. so that link
    # targets and content file names can be compared loosely.
    return re.sub(r"[?&! /\\%:()|\"'.,;-]", "", s.replace(".md", ""))
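# A link "exists" if its stripped name matches the stripped name of some file
# in the content folder, e.g. a target of "My-Note.md" and a file named
# "My Note.md" both reduce to "MyNote" (example names are illustrative).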
def md_file_existence_heuristic(existing_files_set, encoded_url) -> bool:
return strip_name(encoded_url) in existing_files_set
data = load_json(INDEX_FILE)
existing_files = set([strip_name(file) for file in os.listdir(CONTENT_FOLDER)])
links_index: dict = data["index"]["links"]
backlinks_index: dict = data["index"]["backlinks"]
links_list: list = data["links"]
processed_data = {"index": {
"links": {},
"backlinks": {},
},
"links": [],
}
# === Remove false links ===
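# Rebuild the outbound-link and backlink indices, keeping only pages (keys)
# and targets that resolve to real files under CONTENT_FOLDER.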
for key in links_index:
    if not md_file_existence_heuristic(existing_files, key):
        # print(f'removing {key}')
        continue
    processed_data["index"]["links"][key] = []
    for entry in links_index[key]:
        if md_file_existence_heuristic(existing_files, entry["target"]):
            processed_data["index"]["links"][key].append(entry)
        # else:
        #     print(f'removing {entry}')
for key in backlinks_index:
    if not md_file_existence_heuristic(existing_files, key):
        # print(f'removing {key}')
        continue
    processed_data["index"]["backlinks"][key] = []
    for entry in backlinks_index[key]:
        if md_file_existence_heuristic(existing_files, entry["target"]):
            processed_data["index"]["backlinks"][key].append(entry)
        # else:
        #     print(f'removing {entry}')
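# The flat "links" list is what drives the graph view. Targets must resolve to
# a real content file (so links pointing at "/" are dropped, per the module
# docstring); sources may be either a real file or the site root "/".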
for entry in links_list:
    if md_file_existence_heuristic(existing_files, entry["target"]) and (
        entry["source"] == "/"
        or md_file_existence_heuristic(existing_files, entry["source"])
    ):
        processed_data["links"].append(entry)
    # else:
    #     print(f'removing {entry}')
# Print deletion summary
print(f'POSTPROCESS: removed {len(data["index"]["links"]) - len(processed_data["index"]["links"])} outbound-link index entries for nonexistent pages, out of {len(data["index"]["links"])}')
print(f'POSTPROCESS: removed {len(data["index"]["backlinks"]) - len(processed_data["index"]["backlinks"])} backlink index entries for nonexistent pages, out of {len(data["index"]["backlinks"])}')
print(f'POSTPROCESS: removed {len(data["links"]) - len(processed_data["links"])} false links, out of {len(data["links"])}')
# === Fix orphans ===
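# Nodes that lost every edge during the cleanup above would otherwise vanish
# from the graph, so give each of them a self-link to keep it visible.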
linked_nodes = ({entry["source"] for entry in processed_data["links"]}
                | {entry["target"] for entry in processed_data["links"]})
orphans_added = 0
# Iterate the cleaned index so that pages removed above don't come back as
# self-linked nodes.
for key in processed_data["index"]["links"]:
    if key not in linked_nodes:
        processed_data["links"].append({
            "source": key,
            "target": key,
            "text": key,
        })
        orphans_added += 1
        # print(f"add self link for {key}")
print(f'POSTPROCESS: added {orphans_added} orphan self-links to the graph')
# BUG: doesn't catch notes that are already orphans in Obsidian, i.e. this only
# handles notes orphaned by the false-link removal above
# Write the cleaned index back in place
with open(INDEX_FILE, "w") as outfile:
    json.dump(processed_data, outfile, indent=2)