-
Notifications
You must be signed in to change notification settings - Fork 5
/
dedup.lean
69 lines (61 loc) · 2.89 KB
/
dedup.lean
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import LeanAide.PremiseData
open Lean Json Data LeanAide.Meta
def pairs? (s: String) : Option <| List IdentPair :=
let js? := Lean.Json.parse s |>.toOption
let ids? : Option IdentData :=
js?.bind (fun js => fromJson? js |>.toOption)
ids?.map <| fun id => id.unfold |>.toList
def allPairs (ss: List String) : List IdentPair :=
ss.foldl (fun acc s =>
match pairs? s with
| some ps => ps ++ acc
| none => acc
) []
def allPairsString (ss: List String) : String :=
jsonLines <| (allPairs ss).toArray
def main (_: List String) : IO Unit := do
IO.println "Deduplicating core"
-- let allLines :=
-- (← IO.FS.lines (System.mkFilePath ["rawdata", "premises", "core", "all.jsonl"])).filter (fun l => l != "") -- |>.toList.eraseDups
let trainLines :=
(← IO.FS.lines (System.mkFilePath ["rawdata", "premises", "core", "train.jsonl"])) --.filter (fun l => l != "") |>.toList.eraseDups
let testLines :=
(← IO.FS.lines (System.mkFilePath ["rawdata", "premises", "core", "test.jsonl"])) --.filter (fun l => l != "" && !trainLines.contains l) -- |>.toList.eraseDups
let validLines :=
(← IO.FS.lines (System.mkFilePath ["rawdata", "premises", "core", "valid.jsonl"])) --.filter (fun l => l != "" && !trainLines.contains l && !testLines.contains l) -- |>.toList.eraseDups
IO.println s!"Read lines: {trainLines.size}, {testLines.size}, {validLines.size}"
let handles ← mainFileHandles
let lineMap : Std.HashMap String <| Array String := Std.HashMap.ofList <| [("train", trainLines), ("test", testLines), ("valid", validLines)]
let mut prevLines: Std.HashSet String := {}
let mut count := 0
let mut dups := 0
for group in groups do
IO.println s!"Writing group {group}"
for l in lineMap.find! group do
if !prevLines.contains l then
count := count + 1
prevLines := prevLines.insert l
let js? := Lean.Json.parse l |>.toOption
let premise? : Option CorePremiseData :=
js?.bind <| fun js => (fromJson? js).toOption
match premise? with
| some corePremise =>
corePremise.write group handles
let identData :=
IdentData.ofCorePremiseData corePremise
identData.write group handles
identData.writeString group handles
let identPairs := identData.unfold
for identPair in identPairs do
identPair.write group handles
let termPairs := TermPair.ofCorePremiseData corePremise
for termPair in termPairs do
termPair.write group handles
let lemmaPairs := LemmaPair.ofCorePremiseData corePremise
for lemmaPair in lemmaPairs do
lemmaPair.write group handles
if count % 5000 = 0 then
IO.println s!"obtained {count} definitions; {dups} duplicates"
| none =>
IO.println s!"Could not parse: {l}"
else dups := dups + 1