Skip to content

Commit

Permalink
Merge pull request #1382 from Signbank/wordnet_link_1350
Browse files Browse the repository at this point in the history
Add synset wordnet links to glosses
  • Loading branch information
vanlummelhuizen authored Dec 11, 2024
2 parents 43d4326 + 1a9b609 commit 3a6f445
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 1 deletion.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
setuptools==70.0.0
asgiref==3.7.2
attrs==17.4.0
beautifulsoup4==4.12.3
Django==4.2.16
django-appconf==1.0.5
django-bootstrap3==22.2
Expand All @@ -19,6 +20,7 @@ future==0.18.3
flatdict==4.0.0
git+https://github.com/vanlummelhuizen/CNGT-scripts@master#egg=CNGT_scripts
lxml==4.9.3
nltk==3.9.1
numpy==1.26.2
py==1.11.0
pympi-ling==1.70.2
Expand Down
2 changes: 2 additions & 0 deletions signbank/dictionary/adminviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -1444,6 +1444,8 @@ def get_context_data(self, **kwargs):
else:
context['annotated_sentences'] = annotated_sentences[0:3]

context['synsets'] = gl.synsets.all()

bad_dialect = False
gloss_dialects = []

Expand Down
224 changes: 224 additions & 0 deletions signbank/dictionary/management/commands/update_wordnet_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""This script updates the WordNet links in the database. This works ONLY for NGT dataset."""

import os
import io
import shutil
import requests
import csv

from django.core.management import BaseCommand

from signbank.dictionary.models import Synset, Dataset, Gloss
from signbank.settings.server_specific import WORDNET_USERNAME, WORDNET_PASSWORD

import nltk
from nltk.corpus import wordnet as wn
from bs4 import BeautifulSoup

LOGIN_URL = "https://signwordnetannotation.pythonanywhere.com/login.html"
DOWNLOAD_SIGNS_LINKS_URL = "https://signwordnetannotation.pythonanywhere.com/generate_csv.html"
EXPECTED_LANDING_URL = "https://signwordnetannotation.pythonanywhere.com/"
DOWNLOAD_TAB_URL = "https://www.sign-lang.uni-hamburg.de/easier/sign-wordnet/static/tab/sign_wordnet_gloss_dse.tab"
LINK_BASE = "https://www.sign-lang.uni-hamburg.de/easier/sign-wordnet/synset/"

class Command(BaseCommand):
help = 'Update the WordNet links in the database. This works ONLY for NGT dataset.'

def download_wordnet_gloss(self, timeout=30):
    """Download the Sign WordNet gloss tab file from DOWNLOAD_TAB_URL.

    Args:
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The raw response body (``bytes``) on HTTP 200, otherwise ``None``.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    response = requests.get(DOWNLOAD_TAB_URL, timeout=timeout)
    if response.status_code != 200:
        print("Failed to download WordNet gloss file. Status code:", response.status_code)
        # Explicit so callers can see that failure is signalled with None.
        return None

    print("WordNet gloss file downloaded successfully.")
    return response.content

def download_csv(self, session, csv_type, timeout=30):
    """Download one CSV export through an already logged-in session.

    Args:
        session: Session object used to POST to DOWNLOAD_SIGNS_LINKS_URL.
        csv_type: Value sent as the ``submit`` form field; this file calls
            it with ``"links"`` and ``"signs"``.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The raw response body (``bytes``) on HTTP 200, otherwise ``None``.
    """
    response = session.post(
        DOWNLOAD_SIGNS_LINKS_URL,
        data={"submit": csv_type},
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Failed to download {csv_type} CSV. Status code:", response.status_code)
        return None

    print(f"{csv_type} CSV downloaded successfully.")
    # Returned directly instead of via a local named ``csv``, which used to
    # shadow the imported ``csv`` module inside this method.
    return response.content

def download_links_csv(self, timeout=30):
    """Log in to the annotation site and download the links and signs CSVs.

    Args:
        timeout: Seconds to wait for the login request. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        ``None`` when the login did not end on EXPECTED_LANDING_URL.
        Otherwise a ``(links_csv, signs_csv)`` tuple; each item is the value
        returned by ``download_csv`` and may therefore itself be ``None``.
    """
    login_payload = {
        "username": WORDNET_USERNAME,
        "password": WORDNET_PASSWORD,
    }

    # The context manager closes the session (and its pooled connections)
    # on every exit path, including the early return below.
    with requests.Session() as session:
        login_response = session.post(LOGIN_URL, data=login_payload, timeout=timeout)

        # Success is judged by the final URL of the login request, not by
        # its status code: anything other than the landing page is a failure.
        if login_response.url != EXPECTED_LANDING_URL:
            print("Login failed or redirected incorrectly.")
            return None
        print("Login successful!")

        links_csv = self.download_csv(session, "links")
        signs_csv = self.download_csv(session, "signs")

    return links_csv, signs_csv

def get_links_data(self, links_csv, signs_csv, link_base=None, min_confidence=5):
    """Turn the downloaded CSV exports into a mapping of sign id -> synset links.

    Args:
        links_csv: UTF-8 encoded bytes of the links CSV. After a header row,
            each row holds the sign id, the synset id, an integer confidence
            and a fourth column that is passed through unchanged.
        signs_csv: UTF-8 encoded bytes of the signs CSV. After a header row,
            each row holds the WordNet sign id and the Signbank sign id.
        link_base: URL prefix for the synset pages; defaults to LINK_BASE.
        min_confidence: Rows with a lower confidence are dropped.

    Returns:
        A dict mapping a sign id to a list of
        ``[synset id, confidence, fourth column, synset url]`` lists. The key
        is the Signbank id when the signs CSV maps the WordNet sign id, and
        the WordNet sign id itself otherwise.

    Raises:
        ValueError: If a confidence value is not an integer.
    """
    if link_base is None:
        # Looked up at call time so the module constant stays the default.
        link_base = LINK_BASE

    def clean(value):
        # The export pads values with spaces and wraps them in single quotes.
        return value.replace("'", '').replace(" ", "")

    # Map WordNet sign ids to Signbank sign ids.
    signs = {}
    signs_reader = csv.reader(io.StringIO(signs_csv.decode('utf-8')))
    next(signs_reader, None)  # skip the header; tolerate an empty file
    for row in signs_reader:
        if len(row) < 2:
            # csv.reader yields [] for blank lines; nothing to map.
            continue
        wordnet_sign_id = clean(row[0].replace("ngt.", ""))
        signs[wordnet_sign_id] = clean(row[1])

    # Collect the links per sign.
    links = {}
    links_reader = csv.reader(io.StringIO(links_csv.decode('utf-8')))
    next(links_reader, None)  # skip the header; tolerate an empty file
    for row in links_reader:
        if len(row) < 4:
            continue
        row = [clean(field) for field in row]

        # Only keep rows that are confident enough.
        if int(row[2]) < min_confidence:
            continue

        wordnet_sign_id = row[0].replace("ngt.", "")
        # Fall back to the WordNet id when the signs CSV has no mapping.
        # The original indexed ``signs`` unconditionally here and raised
        # KeyError for exactly those unmapped signs.
        sign_id = signs.get(wordnet_sign_id, wordnet_sign_id)

        link = f"{link_base}{row[1]}.html"
        links.setdefault(sign_id, []).append([row[1], row[2], row[3], link])

    return links


def get_lemma_definitions(self, glosses_wordnet):
    """Look up lemma names and definitions for the synsets in the gloss file.

    Args:
        glosses_wordnet: UTF-8 encoded bytes of the tab-separated gloss file.
            The first line is skipped as a header; the first column of every
            other line is read as an ``<offset>-<pos>`` synset id.

    Returns:
        A dict mapping ``"omw." + synset id`` to
        ``[lemma names, definition]`` for every id the ``wn`` corpus reader
        resolves. Rows with an empty or malformed id are skipped.
    """
    wn_dict = {}
    for row in glosses_wordnet.decode("utf-8").split('\n')[1:]:
        # Drop the '\r' of CRLF line endings so that blank lines are really
        # empty and the id column stays clean.
        row = row.rstrip('\r')
        if not row:
            continue

        omw_id = row.split('\t')[0]
        id_parts = omw_id.split('-')
        if len(id_parts) != 2 or not id_parts[0].isdigit():
            # One bad line should not abort the whole update.
            print("Skipping malformed synset id:", omw_id)
            continue
        offset, pos = id_parts

        synset = wn.synset_from_pos_and_offset(pos, int(offset))
        if synset:
            wn_dict["omw." + omw_id] = [synset.lemma_names(), synset.definition()]

    return wn_dict

def find_lemmas_description_in_html(self, html):
    """Scrape the lemmas and the definition from a synset page.

    Args:
        html: Markup of a Multilingual Sign Language Wordnet synset page.

    Returns:
        A ``(lemmas, definition)`` tuple of strings. ``lemmas`` is the
        comma-separated content of the first ``<p>`` containing "Lemmas:",
        re-joined with ", "; ``definition`` is the text of the first
        ``<p class="synset_def">`` without its "Definition:" label. Either
        one is "" when the page has no such paragraph.
    """
    document = BeautifulSoup(html, "html.parser")

    # Lemmas: only the first paragraph mentioning "Lemmas:" is used.
    lemmas = ""
    for paragraph in document.find_all("p"):
        text = paragraph.get_text()
        if "Lemmas:" not in text:
            continue
        names = text.replace("Lemmas:", "").split(",")
        lemmas = ", ".join(name.strip() for name in names)
        break

    # Definition: identified by its CSS class rather than by its text.
    definition = ""
    definition_node = document.find("p", class_="synset_def")
    if definition_node:
        definition = definition_node.get_text().replace("Definition:", "").strip()

    return lemmas, definition

def update_links_data(self, links, wn_dict, timeout=30):
    """Rebuild the synset links of the NGT glosses from freshly parsed data.

    Args:
        links: Mapping of gloss id -> list of link lists whose first item is
            the synset name and whose fourth item is the synset URL (the
            structure built by ``get_links_data``). Non-numeric ids and ids
            without a matching Gloss are ignored.
        wn_dict: Mapping of synset name -> ``[lemma names, definition]``
            (the structure built by ``get_lemma_definitions``).
        timeout: Seconds to wait when checking a synset URL. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.
    """
    ngt_dataset = Dataset.objects.get(acronym="NGT")

    # Unlink all synsets from glosses. The join returns a synset once per
    # linked gloss; distinct() avoids clearing the same synset repeatedly.
    for synset in Synset.objects.filter(glosses__lemma__dataset=ngt_dataset).distinct():
        synset.glosses.clear()

    for gloss_id, gloss_links in links.items():

        if not str(gloss_id).isdigit():
            continue

        gloss = Gloss.objects.filter(id=int(gloss_id)).first()
        if not gloss:
            continue

        # Create the Synset objects and add them to the Gloss
        for link in gloss_links:
            synset_name, synset_url = link[0], link[3]

            synset = Synset.objects.filter(
                name=synset_name, glosses__lemma__dataset=ngt_dataset
            ).first()

            if not synset:
                # All links were cleared above, so a synset found by the
                # lookup was created and filled in earlier in this run.
                # Metadata is therefore gathered once, on creation.
                synset = Synset.objects.create(name=synset_name)

                # Prefer lemmas and description from the WordNet gloss file
                if synset_name in wn_dict:
                    synset.lemmas = ', '.join(wn_dict[synset_name][0])
                    synset.description = wn_dict[synset_name][1]

                # Check if the URL is valid. An unreachable page is treated
                # like an invalid URL so one network hiccup cannot abort the
                # run after the old links have already been removed.
                try:
                    response = requests.get(synset_url, timeout=timeout)
                except requests.RequestException:
                    response = None

                if response is not None and response.status_code == 200:
                    synset.url = synset_url
                    # Not in the WordNet gloss file: scrape the page instead
                    if not synset.lemmas:
                        synset.lemmas, synset.description = self.find_lemmas_description_in_html(response.text)
                synset.save()

            # exists() asks the database about this one link instead of
            # loading every synset of the gloss.
            if not gloss.synsets.filter(pk=synset.pk).exists():
                gloss.synsets.add(synset)
                gloss.save()

    # Delete the Synset objects that are not in the new synsets (outdated)
    Synset.objects.filter(glosses=None).delete()

    print("WordNet links updated successfully.")


def handle(self, *args, **options):
    """Download the WordNet data and refresh the synset links.

    Raises:
        CommandError: If the gloss file or one of the CSV exports could not
            be downloaded, or the login failed. The database is not touched
            in that case.
    """
    # Imported here so this method does not depend on the module's import
    # block; django.core.management is already a dependency of this file.
    from django.core.management import CommandError

    nltk.download('wordnet')
    nltk.download('omw')

    # The download helpers signal failure with None. Stop with a clear
    # message instead of failing later on unpacking or ``.decode``.
    glosses_wordnet = self.download_wordnet_gloss()
    if glosses_wordnet is None:
        raise CommandError("Could not download the WordNet gloss file.")

    downloaded = self.download_links_csv()
    if downloaded is None:
        raise CommandError("Could not log in to download the WordNet links.")
    links_csv, signs_csv = downloaded
    if links_csv is None or signs_csv is None:
        raise CommandError("Could not download the WordNet links and signs CSV files.")

    links = self.get_links_data(links_csv, signs_csv)
    wn_dict = self.get_lemma_definitions(glosses_wordnet)
    self.update_links_data(links, wn_dict)

24 changes: 24 additions & 0 deletions signbank/dictionary/migrations/0087_synset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.10 on 2024-12-04 11:49

from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the Synset model, including its many-to-many link to Gloss."""

    # Applied on top of the previous dictionary migration.
    dependencies = [
        ('dictionary', '0086_alter_dataset_options'),
    ]

    operations = [
        migrations.CreateModel(
            name='Synset',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'name',
                    models.CharField(max_length=200, verbose_name='Name'),
                ),
                (
                    'lemmas',
                    models.TextField(blank=True, verbose_name='Lemmas'),
                ),
                (
                    'url',
                    models.TextField(blank=True, verbose_name='Url'),
                ),
                (
                    'description',
                    models.TextField(blank=True, verbose_name='Description'),
                ),
                (
                    # Glosses reach their synsets through ``gloss.synsets``.
                    'glosses',
                    models.ManyToManyField(
                        related_name='synsets',
                        to='dictionary.gloss',
                        verbose_name='Glosses',
                    ),
                ),
            ],
        ),
    ]
13 changes: 12 additions & 1 deletion signbank/dictionary/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4303,4 +4303,15 @@ def get_absolute_url(self):
parsed_url = urlparse(self.url)
if not parsed_url.scheme:
return 'http://' + self.url
return self.url
return self.url

class Synset(models.Model):
    """A synset: a set of synonymous glosses.

    Glosses reach their synsets through the ``synsets`` reverse accessor.
    """

    # Short identifying name; also used as the string representation.
    name = models.CharField(verbose_name=_("Name"), max_length=200)
    # Optional free-text fields; each may be left blank.
    lemmas = models.TextField(verbose_name=_("Lemmas"), blank=True)
    url = models.TextField(verbose_name=_("Url"), blank=True)
    description = models.TextField(verbose_name=_("Description"), blank=True)
    # The glosses that belong to this synset.
    glosses = models.ManyToManyField(
        Gloss,
        verbose_name=_("Glosses"),
        related_name='synsets',
    )

    def __str__(self):
        """Return the synset name."""
        return self.name
19 changes: 19 additions & 0 deletions signbank/dictionary/templates/dictionary/gloss_detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,25 @@ <h2 id='modalTitleGloss'>{% trans "Add a sense" %}</h2>
</td>
</tr>

<tr>
    {# One line per synset linked to this gloss; the name links to the synset page when a URL is known. #}
    <th>
        {% trans "Synsets" %}
    </th>
    <td>
        {% for synset in synsets %}
        <span class="synset">
            {% if synset.url %}
            <a href='{{ synset.url }}'>{{synset.name}}</a>
            {% else %}
            {{synset.name}}
            {% endif %}
            {% if synset.lemmas %}
            {{synset.lemmas}} | {{synset.description}}
            {% endif %}
        </span><br>
        {% endfor %}
    </td>
</tr>

{% for value,name,label,kind in main_fields %}
<tr><th>{{label}}</th>
{% if kind == 'text' %}
Expand Down

0 comments on commit 3a6f445

Please sign in to comment.