Skip to content

Commit

Permalink
✨ added DELA extracter
Browse files Browse the repository at this point in the history
  • Loading branch information
maxB9F authored Jul 28, 2023
1 parent cd1287a commit 54daf7d
Showing 1 changed file with 114 additions and 0 deletions.
114 changes: 114 additions & 0 deletions tools/dela_extracter.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
<!doctype html>
<html lang="fr">
<head>
<meta charset="utf-8">
<!--
COATL - Alignment Tool for Litterary (or other) texts.
Copyright (C) 2023 Maxime Kremer
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
<title>DELA EXTRACTER FOR COATL</title>
<script>
/**
* A word in the dictionary.
* @typedef {Object} word
* @property {string[]} l - Lemmas
* @property {!string[]} t - Types
* @property {?string[]} n - Grammatical number(s) and gender(s) if not a verb
* @property {?string[]} c - Grammatical conjugation(s) if a verb
*/

var openFile = function(event) {
var input = event.target;
input.style.display = "none";
var reader = new FileReader();
reader.onload = function() {
var text = reader.result;
var node = document.getElementById("output");
node.innerText = text;
};
reader.readAsText(input.files[0]);
document.getElementById("name_storer").innerText = input.files[0].name;
document.getElementById("create_json").style.display = "block";
};
function download(filename, text) {
var element = document.createElement("a");
element.setAttribute("href", "data:text/plain;charset=utf-8," + encodeURIComponent(text));
element.setAttribute("download", filename);
element.style.display = "none";
document.body.appendChild(element);
element.click();
document.body.removeChild(element);
}
document.addEventListener("DOMContentLoaded", function() {
document.getElementById("create_json").addEventListener("click", function(event) {
event.target.style.display = "none";
let text = document.getElementById("output").innerText;

// JSON maker script : text input should be uncompressed dictionary from Unitex GramLab, it was made for DELA (french)

let text_array = text.split(/[\r\n]+/).filter(line => line.length > 0);
console.log("nombre d'entrées initial : "+text_array.length);
//we remove coumpounds with whitespaces, apostrophes and dashes :
text_array = text_array.filter(line => !((/[\s'-]+/).test(line)));
console.log("nombre d'entrées après premier filtre : "+text_array.length);
//we remove 3rd layer :
text_array = text_array.filter(line => !((/z3/).test(line)));
console.log("nombre d'entrées après deuxième filtre : "+text_array.length);
var return_object = {};
var numbobj = {};
for (var line of text_array) {
let word = {};
let ar1 = line.split('.');
let ar2 = ar1[0].split(',');
word.l = (ar2.length > 1 && ar2[1].length > 0) ? [ar2[1]] : [ar2[0]];
let ar3 = ar1[1].split(':');
word.t = ar3[0].split('+').filter(type => !((/^(z|Hum|NPropre|PFX)/).test(type)));
if (word.t.length == 0) continue;
let types = word.t.sort().join(",");
if (types in numbobj) numbobj[types]++;
else numbobj[types] = 1;
if (ar3.length > 1) {
if (word.t[0] == 'V') word.c = ar3.slice(1);
else word.n = ar3.slice(1);
}
if (ar2[0] in return_object) { //fusion of two words in the dictionary spelt the same
return_object[ar2[0]].l = [...new Set([...return_object[ar2[0]].l, ...word.l])];
return_object[ar2[0]].t = [...new Set([...return_object[ar2[0]].t, ...word.t])];
if ("n" in word) {
if ("n" in return_object[ar2[0]]) { return_object[ar2[0]].n = [...new Set([...return_object[ar2[0]].n, ...word.n])];}
else return_object[ar2[0]].n = word.n;
}
if ("c" in word) {
if ("c" in return_object[ar2[0]]) { return_object[ar2[0]].c = [...new Set([...return_object[ar2[0]].c, ...word.c])];}
else return_object[ar2[0]].c = word.c;
}
}
else return_object[ar2[0]] = word;
}
console.log("nombre d'entrées final : "+Object.keys(return_object).length);
console.log(numbobj);
download(document.getElementById("name_storer").innerText + ".json", JSON.stringify(return_object));
});
});
</script>
</head>
<body>
<input type="file" accept="text/plain" onchange="openFile(event)">
<button id="create_json" type="button" style="display:none">Create .json file</button>
<div id="output"></div>
<div id="name_storer" style="display:none"></div>
</body>
</html>

0 comments on commit 54daf7d

Please sign in to comment.