✨ added DELA extracter

maxB9F · Jul 28, 2023 · 54daf7d · 54daf7d
1 parent cd1287a
commit 54daf7d
Showing 1 changed file with 114 additions and 0 deletions.
diff --git a/tools/dela_extracter.html b/tools/dela_extracter.html
@@ -0,0 +1,114 @@
+<!doctype html>
+<html lang="fr">
+<head>
+    <meta charset="utf-8">
+    <!--
+    COATL - Alignment Tool for Litterary (or other) texts.
+    Copyright (C) 2023 Maxime Kremer
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->
+    <title>DELA EXTRACTER FOR COATL</title>
+    <script>
+        /**
+         * A word in the dictionary.
+         * @typedef {Object} word
+         * @property {string[]} l - Lemmas
+         * @property {!string[]} t - Types
+         * @property {?string[]} n - Grammatical number(s) and gender(s) if not a verb
+         * @property {?string[]} c - Grammatical conjugation(s) if a verb
+        */
+
+        var openFile = function(event) {
+            var input = event.target;
+            input.style.display = "none";
+            var reader = new FileReader();
+            reader.onload = function() {
+                var text = reader.result;
+                var node = document.getElementById("output");
+                node.innerText = text;
+            };
+            reader.readAsText(input.files[0]);
+            document.getElementById("name_storer").innerText = input.files[0].name;
+            document.getElementById("create_json").style.display = "block";
+        };
+        function download(filename, text) {
+            var element = document.createElement("a");
+            element.setAttribute("href", "data:text/plain;charset=utf-8," + encodeURIComponent(text));
+            element.setAttribute("download", filename);
+            element.style.display = "none";
+            document.body.appendChild(element);
+            element.click();
+            document.body.removeChild(element);
+        }
+        document.addEventListener("DOMContentLoaded", function() {
+            document.getElementById("create_json").addEventListener("click", function(event) {
+                event.target.style.display = "none";
+                let text = document.getElementById("output").innerText;
+
+                // JSON maker script : text input should be uncompressed dictionary from Unitex GramLab, it was made for DELA (french)
+
+                let text_array = text.split(/[\r\n]+/).filter(line => line.length > 0);
+                console.log("nombre d'entrées initial : "+text_array.length);
+                //we remove coumpounds with whitespaces, apostrophes and dashes :
+                text_array = text_array.filter(line => !((/[\s'-]+/).test(line)));
+                console.log("nombre d'entrées après premier filtre : "+text_array.length);
+                //we remove 3rd layer :
+                text_array = text_array.filter(line => !((/z3/).test(line)));
+                console.log("nombre d'entrées après deuxième filtre : "+text_array.length);
+                var return_object = {};
+                var numbobj = {};
+                for (var line of text_array) {
+                    let word = {};
+                    let ar1 = line.split('.');
+                    let ar2 = ar1[0].split(',');
+                    word.l = (ar2.length > 1 && ar2[1].length > 0) ? [ar2[1]] : [ar2[0]];
+                    let ar3 = ar1[1].split(':');
+                    word.t = ar3[0].split('+').filter(type => !((/^(z|Hum|NPropre|PFX)/).test(type)));
+                    if (word.t.length == 0) continue;
+                    let types = word.t.sort().join(",");
+                    if (types in numbobj) numbobj[types]++;
+                    else numbobj[types] = 1;
+                    if (ar3.length > 1) {
+                        if (word.t[0] == 'V') word.c = ar3.slice(1);
+                        else word.n = ar3.slice(1);
+                    }
+                    if (ar2[0] in return_object) { //fusion of two words in the dictionary spelt the same
+                        return_object[ar2[0]].l = [...new Set([...return_object[ar2[0]].l, ...word.l])];
+                        return_object[ar2[0]].t = [...new Set([...return_object[ar2[0]].t, ...word.t])];
+                        if ("n" in word) {
+                            if ("n" in return_object[ar2[0]]) { return_object[ar2[0]].n = [...new Set([...return_object[ar2[0]].n, ...word.n])];}
+                            else return_object[ar2[0]].n = word.n;
+                        }
+                        if ("c" in word) {
+                            if ("c" in return_object[ar2[0]]) { return_object[ar2[0]].c = [...new Set([...return_object[ar2[0]].c, ...word.c])];}
+                            else return_object[ar2[0]].c = word.c;
+                        }
+                    }
+                    else return_object[ar2[0]] = word;
+                }
+                console.log("nombre d'entrées final : "+Object.keys(return_object).length);
+                console.log(numbobj);
+                download(document.getElementById("name_storer").innerText + ".json", JSON.stringify(return_object));
+            });
+        });
+    </script>
+</head>
+<body>
+    <input type="file" accept="text/plain" onchange="openFile(event)">
+    <button id="create_json" type="button" style="display:none">Create .json file</button>
+    <div id="output"></div>
+    <div id="name_storer" style="display:none"></div>
+</body>
+</html>