-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
114 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
<!doctype html> | ||
<html lang="fr"> | ||
<head> | ||
<meta charset="utf-8"> | ||
<!-- | ||
COATL - Alignment Tool for Litterary (or other) texts. | ||
Copyright (C) 2023 Maxime Kremer | ||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
--> | ||
<title>DELA EXTRACTER FOR COATL</title> | ||
<script> | ||
/** | ||
* A word in the dictionary. | ||
* @typedef {Object} word | ||
* @property {string[]} l - Lemmas | ||
* @property {!string[]} t - Types | ||
* @property {?string[]} n - Grammatical number(s) and gender(s) if not a verb | ||
* @property {?string[]} c - Grammatical conjugation(s) if a verb | ||
*/ | ||
|
||
var openFile = function(event) { | ||
var input = event.target; | ||
input.style.display = "none"; | ||
var reader = new FileReader(); | ||
reader.onload = function() { | ||
var text = reader.result; | ||
var node = document.getElementById("output"); | ||
node.innerText = text; | ||
}; | ||
reader.readAsText(input.files[0]); | ||
document.getElementById("name_storer").innerText = input.files[0].name; | ||
document.getElementById("create_json").style.display = "block"; | ||
}; | ||
function download(filename, text) { | ||
var element = document.createElement("a"); | ||
element.setAttribute("href", "data:text/plain;charset=utf-8," + encodeURIComponent(text)); | ||
element.setAttribute("download", filename); | ||
element.style.display = "none"; | ||
document.body.appendChild(element); | ||
element.click(); | ||
document.body.removeChild(element); | ||
} | ||
document.addEventListener("DOMContentLoaded", function() { | ||
document.getElementById("create_json").addEventListener("click", function(event) { | ||
event.target.style.display = "none"; | ||
let text = document.getElementById("output").innerText; | ||
|
||
// JSON maker script : text input should be uncompressed dictionary from Unitex GramLab, it was made for DELA (french) | ||
|
||
let text_array = text.split(/[\r\n]+/).filter(line => line.length > 0); | ||
console.log("nombre d'entrées initial : "+text_array.length); | ||
//we remove coumpounds with whitespaces, apostrophes and dashes : | ||
text_array = text_array.filter(line => !((/[\s'-]+/).test(line))); | ||
console.log("nombre d'entrées après premier filtre : "+text_array.length); | ||
//we remove 3rd layer : | ||
text_array = text_array.filter(line => !((/z3/).test(line))); | ||
console.log("nombre d'entrées après deuxième filtre : "+text_array.length); | ||
var return_object = {}; | ||
var numbobj = {}; | ||
for (var line of text_array) { | ||
let word = {}; | ||
let ar1 = line.split('.'); | ||
let ar2 = ar1[0].split(','); | ||
word.l = (ar2.length > 1 && ar2[1].length > 0) ? [ar2[1]] : [ar2[0]]; | ||
let ar3 = ar1[1].split(':'); | ||
word.t = ar3[0].split('+').filter(type => !((/^(z|Hum|NPropre|PFX)/).test(type))); | ||
if (word.t.length == 0) continue; | ||
let types = word.t.sort().join(","); | ||
if (types in numbobj) numbobj[types]++; | ||
else numbobj[types] = 1; | ||
if (ar3.length > 1) { | ||
if (word.t[0] == 'V') word.c = ar3.slice(1); | ||
else word.n = ar3.slice(1); | ||
} | ||
if (ar2[0] in return_object) { //fusion of two words in the dictionary spelt the same | ||
return_object[ar2[0]].l = [...new Set([...return_object[ar2[0]].l, ...word.l])]; | ||
return_object[ar2[0]].t = [...new Set([...return_object[ar2[0]].t, ...word.t])]; | ||
if ("n" in word) { | ||
if ("n" in return_object[ar2[0]]) { return_object[ar2[0]].n = [...new Set([...return_object[ar2[0]].n, ...word.n])];} | ||
else return_object[ar2[0]].n = word.n; | ||
} | ||
if ("c" in word) { | ||
if ("c" in return_object[ar2[0]]) { return_object[ar2[0]].c = [...new Set([...return_object[ar2[0]].c, ...word.c])];} | ||
else return_object[ar2[0]].c = word.c; | ||
} | ||
} | ||
else return_object[ar2[0]] = word; | ||
} | ||
console.log("nombre d'entrées final : "+Object.keys(return_object).length); | ||
console.log(numbobj); | ||
download(document.getElementById("name_storer").innerText + ".json", JSON.stringify(return_object)); | ||
}); | ||
}); | ||
</script> | ||
</head> | ||
<body> | ||
<input type="file" accept="text/plain" onchange="openFile(event)"> | ||
<button id="create_json" type="button" style="display:none">Create .json file</button> | ||
<div id="output"></div> | ||
<div id="name_storer" style="display:none"></div> | ||
</body> | ||
</html> |