-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_parallel_data_dgt.py
58 lines (45 loc) · 1.96 KB
/
get_parallel_data_dgt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import requests
import json
import os
import subprocess
import zipfile
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
# Download the DGT Translation Memories listed by the data.europa.eu API,
# unpack the downloaded archives, and export sentence pairs from a TMX file
# into a tab-separated file.

DATASET_URL = "https://data.europa.eu/api/hub/search/datasets/dgt-translation-memory"
DOWNLOAD_DIR = "DGT-TM"
EXTRACT_DIR = "DGT-TM/data"
# NOTE(review): archives are extracted into EXTRACT_DIR, but the TMX file is
# read from DOWNLOAD_DIR and nothing in this script creates it — presumably it
# is produced by a separate extraction step; confirm the intended path.
TMX_PATH = "DGT-TM/dgttm.tmx"
OUTPUT_PATH = "parallel-sentences/DGT-TM-en-sv.tsv"
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests can block forever
DOWNLOAD_CHUNK_SIZE = 1 << 20  # stream archives in 1 MiB chunks

# Characters that would break the one-pair-per-line, tab-separated layout.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\n": " ", "\r": " "})


def fetch_dataset_metadata(url=DATASET_URL, timeout=REQUEST_TIMEOUT):
    """Return the decoded JSON description of the dataset at *url*.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def extract_download_urls(payload):
    """Return the first download URL of every distribution in *payload*.

    *payload* is the decoded API response; distributions are read from
    ``payload["result"]["distributions"]``. A ``download_url`` value may be a
    list (its first entry is used) or a single string. Distributions without
    a download URL are skipped.
    """
    urls = []
    for distribution in payload["result"]["distributions"]:
        candidates = distribution.get("download_url") or []
        if isinstance(candidates, str):
            candidates = [candidates]
        if candidates:
            urls.append(candidates[0])
    return urls


def download_file(url, dest_dir=DOWNLOAD_DIR, timeout=REQUEST_TIMEOUT):
    """Stream *url* into *dest_dir* and return the path of the written file.

    The file name is the last path component of *url*. The destination is
    only opened once the server has answered successfully, so a failed
    request does not leave an empty file behind.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    destination = os.path.join(dest_dir, url.split("/")[-1])
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(destination, "wb") as f:
            for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                f.write(chunk)
    return destination


def extract_archives(source_dir=DOWNLOAD_DIR, target_dir=EXTRACT_DIR):
    """Extract every ``.zip`` file found in *source_dir* into *target_dir*."""
    os.makedirs(target_dir, exist_ok=True)
    for name in os.listdir(source_dir):
        if name.endswith(".zip"):
            with zipfile.ZipFile(os.path.join(source_dir, name), "r") as archive:
                archive.extractall(target_dir)


def extract_pair(tu):
    """Return ``(source, target)`` text of the first two ``<tuv>`` in *tu*.

    Returns ``None`` when the unit has fewer than two ``<tuv>`` elements or
    when either of the first two lacks a ``<seg>`` with text, so callers can
    skip the unit instead of crashing on a missing segment.

    NOTE(review): the pair is taken in document order; the ``xml:lang``
    attributes are not checked, although the output file name suggests an
    en-sv pair — confirm the input only holds those two languages.
    """
    variants = tu.findall(".//tuv")
    if len(variants) < 2:
        return None
    texts = []
    for variant in variants[:2]:
        seg = variant.find("seg")
        if seg is None or seg.text is None:
            return None
        texts.append(seg.text)
    return texts[0], texts[1]


def format_tsv_row(source, target):
    """Return one TSV line for the pair, newline-terminated.

    Tabs and line breaks inside a sentence are replaced by spaces so that
    every pair stays on a single line with exactly one tab delimiter.
    """
    return (
        source.translate(_TSV_UNSAFE) + "\t" + target.translate(_TSV_UNSAFE) + "\n"
    )


def export_sentence_pairs(tmx_path=TMX_PATH, output_path=OUTPUT_PATH):
    """Append the sentence pairs found in *tmx_path* to *output_path*.

    Translation units are read from ``.//body/tu``. Units for which
    ``extract_pair`` returns ``None`` are reported and skipped.
    """
    tree = ET.parse(tmx_path)
    translation_units = tree.findall(".//body/tu")
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Open once (append mode, as before) instead of once per translation unit;
    # UTF-8 is explicit because the sentences are multilingual.
    with open(output_path, "a", encoding="utf-8") as out:
        for tu in tqdm(translation_units, total=len(translation_units)):
            pair = extract_pair(tu)
            if pair is None:
                print("Unpaired translation. Ignoring...")
                continue
            out.write(format_tsv_row(*pair))


def main():
    """Run the download, extraction and TSV export steps in order."""
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    for url in extract_download_urls(fetch_dataset_metadata()):
        download_file(url, DOWNLOAD_DIR)
    extract_archives()
    # NOTE(review): test.rdf is not produced anywhere in this script and
    # test.nt is not read afterwards — this looks like a leftover from
    # another experiment. Kept unchanged; it aborts the run if it fails.
    subprocess.check_call("cwm --rdf test.rdf --ntriples > test.nt", shell=True)
    export_sentence_pairs()


if __name__ == "__main__":
    main()