diff --git a/.gitignore b/.gitignore
index 1e85507..9e99452 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
venv/
en_core_web_sm-3.2.0/
huggingface_cache/
-**/__pycache__/
\ No newline at end of file
+**/__pycache__
+website/Uploaded/*
+!website/Uploaded/Reviews_Lexham_Gardens_London.xlsx
diff --git a/Keyword_collocation.py b/Keyword_collocation.py
index bd0ccc0..f883c6c 100644
--- a/Keyword_collocation.py
+++ b/Keyword_collocation.py
@@ -1,10 +1,11 @@
+import os
import string
import io
import re
import time
import requests
-from langdetect import detect
-import os
+from langdetect import detect
+import os
import nltk
from collections import Counter
import spacy
@@ -14,14 +15,14 @@
from pyvis.network import Network
from flask import current_app
import math
-STOPWORDS = set(["a", "an", "the", "and", "or", "in", "of", "to", "is", "it", "that", "on", "was", "for", "as", "with", "by"]) # Modify with actual stopwords
+STOPWORDS = set(["a", "an", "the", "and", "or", "in", "of", "to", "is", "it",
+ "that", "on", "was", "for", "as", "with", "by"]) # Modify with actual stopwords
PUNCS = string.punctuation
nlp = spacy.load('/freetxt/en_core_web_sm-3.2.0')
-PUNCS += '''!→()-[]{};:"\,<>./?@#$%&*_~.'''
-import time
-import os
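+# PUNCS extends string.punctuation with the extra symbols added below; it is used
+# to strip punctuation from English text before it is sent to the USAS tagger and
+# when filtering tokens in filter_words().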
+PUNCS += '''!→()-[]{};:"\\,<>?@#$%&*_~.'''
+
def cleanup_old_graphs(subdirectory, age_in_seconds=20):
# Construct the path to the subdirectory within the static directory
@@ -46,101 +47,108 @@ def cleanup_old_graphs(subdirectory, age_in_seconds=20):
os.remove(file_path)
print(f"Removed old graph: {filename}")
+
class KWICAnalyser:
- def __init__(self, text_or_dataframe,language):
+ def __init__(self, text_or_dataframe, language):
# If the input is a DataFrame, extract text from its single column
if isinstance(text_or_dataframe, pd.DataFrame):
-
- self.text_rows = [self._preprocess_text(row) for row in text_or_dataframe.iloc[:, 0]]
+
+ self.text_rows = [self._preprocess_text(
+ row) for row in text_or_dataframe.iloc[:, 0]]
else:
# If the input is a single string
self.text_rows = [self._preprocess_text(text_or_dataframe)]
- self.pymusaslist = pd.read_csv('/freetxt/website/data/Pymusas-list.txt', names=['USAS Tags', 'Equivalent Tag'])
+ self.pymusaslist = pd.read_csv(
+ 'website/data/Pymusas-list.txt', names=['USAS Tags', 'Equivalent Tag'])
self.text = self.text_rows
-
- #self.tokens_with_tags = self._tag_text(self.text) # Use the preprocessed text for tagging
- self.lang_detected= language
+
+ # self.tokens_with_tags = self._tag_text(self.text) # Use the preprocessed text for tagging
+ self.lang_detected = language
self.welsh_pos_mapping = {
- "E": "NOUN",
- "E": "PROPN",
- "YFB": "art",
- "YFB": "det",
- "Ar": "prep",
- "Cys": "conj",
- "Rhi": "NUM",
- "Ans": "ADJ",
- "Adf": "ADV",
- "B": "VERB",
- "Rha": "pron",
- "U": "part",
- "Ebych": "intj",
- "Gw": "xx",
- "Gw": "fw",
- "Gw": "code",
- "Atd": "punc",
- }
-
- self.claws_c7_mapping = {
- 'NOUN': ['NN', 'NN1', 'NN2', 'NNJ', 'NNK', 'NN@', 'NNS', 'NN', 'NN1', 'NN2', 'NNA', 'NNB', 'NNB1', 'NNB2', 'NND1', 'NND2', 'NNL1', 'NNL2', 'NNM1', 'NNM2', 'NNO', 'NNO1', 'NNO2', 'NNT1', 'NNT2', 'NNU', 'NNU1', 'NNU2', 'NNV', 'NNV1', 'NNV2'],
- 'PROPN': ['NP', 'NP1', 'NP2', 'NPD', 'NPM', 'NP', 'NP1', 'NP2', 'NPD1', 'NPD2', 'NPM1', 'NPM2'],
- 'VERB': ['VBB', 'VBD', 'VBG', 'VBI', 'VBM', 'VBN', 'VBZ', 'VDB', 'VDD', 'VDG', 'VDI', 'VDN', 'VDZ', 'VHB', 'VHD', 'VHG', 'VHI', 'VHN', 'VHZ', 'VM0', 'VVB', 'VVD', 'VVG', 'VVN', 'VVI', 'VVZ', 'VB0', 'VBDR', 'VBDZ', 'VBG', 'VBI', 'VBM', 'VBN', 'VBZ', 'VDB', 'VDD', 'VDG', 'VDI', 'VDN', 'VDZ', 'VHB', 'VHD', 'VHG', 'VHI', 'VHN', 'VHZ', 'VM0', 'VVB', 'VVD', 'VVG', 'VVN', 'VVI', 'VVZ'],
- 'ADJ': ['JJ', 'JJ@', 'JJR', 'JJT', 'JJ', 'JJR', 'JJT'],
- 'ADV': ['RA', 'RE', 'RG', 'RL', 'RP', 'RP@', 'RPA', 'RPH', 'RPK', 'RR', 'RR@', 'RRQ', 'RRR', 'RRT', 'RA', 'RE', 'REX', 'RG', 'RGQ', 'RGQV', 'RGR', 'RGT', 'RL', 'RP', 'RPK', 'RR', 'RRQ', 'RRR', 'RRT', 'RT'],
- 'NUM': ['MC', 'MC1', 'MC2', 'MD', 'MD1', 'MD2', 'MF', 'MF1', 'MF2', 'MH', 'MH1', 'MH2', 'MI', 'MI1', 'MI2', 'MN', 'MN1', 'MN2', 'MO', 'MO1', 'MO2', 'MP', 'MP1', 'MP2', 'MS', 'MS1', 'MS2', 'MT', 'MT1', 'MT2', 'MC', 'MC1', 'MC2', 'MD', 'MD1', 'MD2', 'MF', 'MF1', 'MF2'],
- 'pronouns': ['PNP', 'PNQ', 'PNX', 'PPGE', 'PPH1', 'PPHO1', 'PPHO2', 'PPHS1', 'PPHS2', 'PPY', 'PN', 'PN1', 'PNQ', 'PNQO', 'PNQS', 'PNQV', 'PNX1', 'PNX2', 'PPGE', 'PPH1', 'PPHO1', 'PPHO2', 'PPHS1', 'PPHS2', 'PPIO1', 'PPIO2', 'PPIS1', 'PPIS2', 'PPX1', 'PPX2', 'PPY'],
- 'determiners': ['DA', 'DAR', 'DAT', 'DB', 'DB2', 'DD', 'DD1', 'DD2', 'DDQ', 'DDQGE', 'DDQV', 'DT', 'DT0', 'DTQ', 'AT', 'AT1', 'DA2', 'DB', 'DB2', 'DD', 'DD1', 'DD2', 'DDQ', 'DDQGE', 'DDQV', 'DT', 'DT0', 'DTQ'],
- 'prepositions': ['II', 'IO', 'IW', 'IF', 'II', 'IO', 'IW', 'TO'],
- 'conjunctions': ['CC', 'CS', 'CCB', 'CSA', 'CST', 'CSW'],
- 'interjections': ['UH'],
- 'modal_verbs': ['VM'],
- 'negations': ['XX'],
- 'symbols': ['ZZ', 'ZZ1', 'ZZ2'],
- 'foreign_words': ['FW'],
- 'existential_there': ['EX'],
- 'general': ['FO', 'FU', 'FW', 'GE', 'JK', 'ND']}
-
-
-
- self.tokens_with_semantic_tags = self.tag_semantics(self.text) # Store the semantic tags
- print(len(self.tokens_with_semantic_tags))
-
- self.PUNCS = [".", ",", "!", ":", ";", "-", "_", "?", "&", "*", "(", ")", "$", "@", "#", "%", "^", "+", "=", "<", ">", "/", "|", "]", "[", "{", "}", "\\", "'", "\""]
- #self.sementic_tags = pd.read_csv('website/data/Cy_tags.csv')
-
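+            # NOTE: "E", "YFB" and "Gw" each appear more than once in this literal;
+            # a Python dict keeps only the last value given for a duplicate key.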
+ "E": "NOUN",
+ "E": "PROPN",
+ "YFB": "art",
+ "YFB": "det",
+ "Ar": "prep",
+ "Cys": "conj",
+ "Rhi": "NUM",
+ "Ans": "ADJ",
+ "Adf": "ADV",
+ "B": "VERB",
+ "Rha": "pron",
+ "U": "part",
+ "Ebych": "intj",
+ "Gw": "xx",
+ "Gw": "fw",
+ "Gw": "code",
+ "Atd": "punc",
+ }
+ self.claws_c7_mapping = {
+ 'NOUN': ['NN', 'NN1', 'NN2', 'NNJ', 'NNK', 'NN@', 'NNS', 'NN', 'NN1', 'NN2', 'NNA', 'NNB', 'NNB1', 'NNB2', 'NND1', 'NND2', 'NNL1', 'NNL2', 'NNM1', 'NNM2', 'NNO', 'NNO1', 'NNO2', 'NNT1', 'NNT2', 'NNU', 'NNU1', 'NNU2', 'NNV', 'NNV1', 'NNV2'],
+ 'PROPN': ['NP', 'NP1', 'NP2', 'NPD', 'NPM', 'NP', 'NP1', 'NP2', 'NPD1', 'NPD2', 'NPM1', 'NPM2'],
+ 'VERB': ['VBB', 'VBD', 'VBG', 'VBI', 'VBM', 'VBN', 'VBZ', 'VDB', 'VDD', 'VDG', 'VDI', 'VDN', 'VDZ', 'VHB', 'VHD', 'VHG', 'VHI', 'VHN', 'VHZ', 'VM0', 'VVB', 'VVD', 'VVG', 'VVN', 'VVI', 'VVZ', 'VB0', 'VBDR', 'VBDZ', 'VBG', 'VBI', 'VBM', 'VBN', 'VBZ', 'VDB', 'VDD', 'VDG', 'VDI', 'VDN', 'VDZ', 'VHB', 'VHD', 'VHG', 'VHI', 'VHN', 'VHZ', 'VM0', 'VVB', 'VVD', 'VVG', 'VVN', 'VVI', 'VVZ'],
+ 'ADJ': ['JJ', 'JJ@', 'JJR', 'JJT', 'JJ', 'JJR', 'JJT'],
+ 'ADV': ['RA', 'RE', 'RG', 'RL', 'RP', 'RP@', 'RPA', 'RPH', 'RPK', 'RR', 'RR@', 'RRQ', 'RRR', 'RRT', 'RA', 'RE', 'REX', 'RG', 'RGQ', 'RGQV', 'RGR', 'RGT', 'RL', 'RP', 'RPK', 'RR', 'RRQ', 'RRR', 'RRT', 'RT'],
+ 'NUM': ['MC', 'MC1', 'MC2', 'MD', 'MD1', 'MD2', 'MF', 'MF1', 'MF2', 'MH', 'MH1', 'MH2', 'MI', 'MI1', 'MI2', 'MN', 'MN1', 'MN2', 'MO', 'MO1', 'MO2', 'MP', 'MP1', 'MP2', 'MS', 'MS1', 'MS2', 'MT', 'MT1', 'MT2', 'MC', 'MC1', 'MC2', 'MD', 'MD1', 'MD2', 'MF', 'MF1', 'MF2'],
+ 'pronouns': ['PNP', 'PNQ', 'PNX', 'PPGE', 'PPH1', 'PPHO1', 'PPHO2', 'PPHS1', 'PPHS2', 'PPY', 'PN', 'PN1', 'PNQ', 'PNQO', 'PNQS', 'PNQV', 'PNX1', 'PNX2', 'PPGE', 'PPH1', 'PPHO1', 'PPHO2', 'PPHS1', 'PPHS2', 'PPIO1', 'PPIO2', 'PPIS1', 'PPIS2', 'PPX1', 'PPX2', 'PPY'],
+ 'determiners': ['DA', 'DAR', 'DAT', 'DB', 'DB2', 'DD', 'DD1', 'DD2', 'DDQ', 'DDQGE', 'DDQV', 'DT', 'DT0', 'DTQ', 'AT', 'AT1', 'DA2', 'DB', 'DB2', 'DD', 'DD1', 'DD2', 'DDQ', 'DDQGE', 'DDQV', 'DT', 'DT0', 'DTQ'],
+ 'prepositions': ['II', 'IO', 'IW', 'IF', 'II', 'IO', 'IW', 'TO'],
+ 'conjunctions': ['CC', 'CS', 'CCB', 'CSA', 'CST', 'CSW'],
+ 'interjections': ['UH'],
+ 'modal_verbs': ['VM'],
+ 'negations': ['XX'],
+ 'symbols': ['ZZ', 'ZZ1', 'ZZ2'],
+ 'foreign_words': ['FW'],
+ 'existential_there': ['EX'],
+ 'general': ['FO', 'FU', 'FW', 'GE', 'JK', 'ND']}
+
+ self.tokens_with_semantic_tags = self.tag_semantics(
+ self.text) # Store the semantic tags
+ # print(len(self.tokens_with_semantic_tags))
+ # print(self.tokens_with_semantic_tags)
+
+ self.PUNCS = [".", ",", "!", ":", ";", "-", "_", "?", "&", "*",
+ "(", ")", "$", "@", "#", "%", "^", "+", "=", "<", ">", "/", "|", "]", "[", "{", "}", "\\", "'", "\""]
+ # self.sementic_tags = pd.read_csv('website/data/Cy_tags.csv')
def _preprocess_text(self, text):
# Convert to lowercase
text = text.lower()
# Remove punctuation
- #text = ''.join([char for char in text if char not in PUNCS])
+ # text = ''.join([char for char in text if char not in PUNCS])
# Remove stopwords
- #text = " ".join(word for word in text.split() if word not in STOPWORDS)
+ # text = " ".join(word for word in text.split() if word not in STOPWORDS)
#
return text
+
def get_kwic(self, keyword, window_size=5, max_instances=30, lower_case=False, by_tag=False, by_sem=False):
-
- tokens = [token for token,pos, tag in self.tokens_with_semantic_tags]
+
+ tokens = [token for token, pos, tag in self.tokens_with_semantic_tags]
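+        # Matching modes: by_tag matches the POS tag, by_sem the USAS semantic tag;
+        # otherwise the keyword is compared against the token text itself.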
if by_tag or by_sem:
if by_tag:
-
- keyword_indexes = [i for i, (token,pos, tag) in enumerate(self.tokens_with_semantic_tags) if pos == keyword]
+
+ keyword_indexes = [i for i, (token, pos, tag) in enumerate(
+ self.tokens_with_semantic_tags) if pos == keyword]
else: # by_sem
-
- keyword_indexes = [i for i, (token,pos, tag) in enumerate(self.tokens_with_semantic_tags) if tag == keyword]
-
+
+ keyword_indexes = [i for i, (token, pos, tag) in enumerate(
+ self.tokens_with_semantic_tags) if tag == keyword]
+
else:
if lower_case:
keyword = keyword.lower()
- keyword_indexes = [i for i, token in enumerate(tokens) if token.lower() == keyword]
+ keyword_indexes = [i for i, token in enumerate(
+ tokens) if token.lower() == keyword]
else:
- keyword_indexes = [i for i, token in enumerate(tokens) if token.lower() == keyword.lower()]
+ keyword_indexes = [i for i, token in enumerate(
+ tokens) if token.lower() == keyword.lower()]
if not keyword_indexes:
-
+
return []
-
- #print("Tokens:", tokens)
+
+ # print("Tokens:", tokens)
# Identify sentence boundaries
sentence_boundaries = []
start = None
@@ -162,35 +170,39 @@ def get_kwic(self, keyword, window_size=5, max_instances=30, lower_case=False, b
elif by_sem:
match = sem_tag == keyword
else:
- match = (token.lower() if lower_case else token) == keyword.lower()
+ match = (token.lower()
+ if lower_case else token) == keyword.lower()
if match:
left_context_index = max(i - window_size, 0)
- right_context_index = min(i + window_size + 1, len(sentence_tokens))
- left_context = ' '.join(t for t, _, _ in self.tokens_with_semantic_tags[start+left_context_index:start+i] if t not in ['stt', 'edd'])
- right_context = ' '.join(t for t, _, _ in self.tokens_with_semantic_tags[start+i+1:start+right_context_index] if t not in ['stt', 'edd'])
+ right_context_index = min(
+ i + window_size + 1, len(sentence_tokens))
+ left_context = ' '.join(
+ t for t, _, _ in self.tokens_with_semantic_tags[start+left_context_index:start+i] if t not in ['stt', 'edd'])
+ right_context = ' '.join(
+ t for t, _, _ in self.tokens_with_semantic_tags[start+i+1:start+right_context_index] if t not in ['stt', 'edd'])
kwic_insts.append((left_context, token, right_context))
if len(kwic_insts) >= max_instances:
return kwic_insts
return kwic_insts
-
def get_top_n_words(self, remove_stops=False, topn=30):
- text_tokens = [word for word in self.text.split() if word not in STOPWORDS] if remove_stops else self.text.split()
+        # self.text is a list of row strings, so join it before tokenising
+        full_text = " ".join(self.text)
+        text_tokens = [word for word in full_text.split()
+                       if word not in STOPWORDS] if remove_stops else full_text.split()
return Counter(text_tokens).most_common(topn)
def get_collocs(self, kwic_insts, topn=30):
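+        # For each KWIC instance, take the word immediately to the left and to the
+        # right of the keyword, drop stopwords, and return the topn most frequent.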
words = []
for l, t, r in kwic_insts:
- # consider only the word immediately before the keyword
+ # consider only the word immediately before the keyword
if l.split():
words.append(l.split()[-1])
# consider only the word immediately after the keyword
if r.split():
words.append(r.split()[0])
all_words = [word for word in words if word not in STOPWORDS]
- collocs =Counter(all_words).most_common(topn)
+ collocs = Counter(all_words).most_common(topn)
return collocs
def plot_coll_14(self, keyword, collocs, output_file='network.html'):
@@ -198,26 +210,30 @@ def plot_coll_14(self, keyword, collocs, output_file='network.html'):
top_collocs_df = pd.DataFrame(collocs, columns=['word', 'freq'])
visualisation_text_en = "Visualisation by"
visualisation_text_cy = "Gweledigaeth gan"
+ print("lang_detected", self.lang_detected)
if self.lang_detected == 'en':
visualisation_text = visualisation_text_en
elif self.lang_detected == 'cy':
- visualisation_text = visualisation_text_cy
+ visualisation_text = visualisation_text_cy
top_collocs_df.insert(1, 'source', keyword)
- top_collocs_df = top_collocs_df[top_collocs_df['word'] != keyword] # remove row where keyword == word
- G = nx.from_pandas_edgelist(top_collocs_df, source='source', target='word', edge_attr='freq')
+ # remove row where keyword == word
+ top_collocs_df = top_collocs_df[top_collocs_df['word'] != keyword]
+ G = nx.from_pandas_edgelist(
+ top_collocs_df, source='source', target='word', edge_attr='freq')
n = max(counts)
- print(G.nodes())
+ # print(G.nodes())
most_frequent_word = max(collocs, key=lambda x: x[1])[0]
net = Network(notebook=True, height='750px', width='100%')
gravity = -200 * n / sum(counts)
- net.barnes_hut(gravity=gravity* 30)
+ net.barnes_hut(gravity=gravity * 30)
for node, count in zip(G.nodes(), counts):
node_color = 'green' if node == most_frequent_word else 'gray' if node == keyword else 'blue'
node_size = 80 * count / n
font_size = max(6, int(node_size / 2))
- net.add_node(node, label=node, title=node, color=node_color, size=node_size, font={'size': font_size, 'face': 'Arial'})
+ net.add_node(node, label=node, title=node, color=node_color,
+ size=node_size, font={'size': font_size, 'face': 'Arial'})
for source, target, freq in top_collocs_df[['source', 'word', 'freq']].values:
if source in net.get_nodes() and target in net.get_nodes():
@@ -225,9 +241,9 @@ def plot_coll_14(self, keyword, collocs, output_file='network.html'):
cleanup_old_graphs("/freetxt/website/static/network_graphs")
timestamp = int(time.time())
-
-
- graph_folder = "/freetxt/website/static/network_graphs"
+
+ graph_folder = os.path.join(
+ current_app.static_folder, "network_graphs")
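+        # current_app.static_folder is the absolute path of the Flask app's static/
+        # directory, so the saved file is served at the /static/... URL returned below.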
filename = f"network_{timestamp}.html"
graph_path = os.path.join(graph_folder, filename)
@@ -235,11 +251,12 @@ def plot_coll_14(self, keyword, collocs, output_file='network.html'):
original_working_dir = os.getcwd()
os.chdir(graph_folder)
net.save_graph(graph_path)
- os.chdir(original_working_dir) # Revert to the original working directory
+ # Revert to the original working directory
+ os.chdir(original_working_dir)
# Read the saved HTML file
with open(graph_path, "r", encoding='utf-8') as file:
- html = file.read()
+ html = file.read()
# Add the "Visualisation by" text and logo
addition = f"""
@@ -251,12 +268,11 @@ def plot_coll_14(self, keyword, collocs, output_file='network.html'):
# Write the updated HTML content back to the file
with open(graph_path, "w", encoding='utf-8') as file:
- file.write(html)
+ file.write(html)
# Return the relative path to the saved graph
return f"/static/network_graphs/{filename}"
-
def _compute_mi_and_ll(self, word_freq, coll_freq, joint_freq):
"""
Computes the Mutual Information (MI) and Log-Likelihood (LL) values for a given word and collocate.
@@ -276,10 +292,16 @@ def _compute_mi_and_ll(self, word_freq, coll_freq, joint_freq):
O22 = len(self.tokens_with_semantic_tags) - O11 - O12 - O21
# Expected frequencies
- E11 = (word_freq * coll_freq) / len(self.tokens_with_semantic_tags) if len(self.tokens_with_semantic_tags) != 0 else 0
- E12 = (word_freq * (len(self.tokens_with_semantic_tags) - coll_freq)) / len(self.tokens_with_semantic_tags) if len(self.tokens_with_semantic_tags) != 0 else 0
- E21 = ((len(self.tokens_with_semantic_tags) - word_freq) * coll_freq) / len(self.tokens_with_semantic_tags) if len(self.tokens_with_semantic_tags) != 0 else 0
- E22 = ((len(self.tokens_with_semantic_tags) - word_freq) * (len(self.tokens_with_semantic_tags) - coll_freq)) / len(self.tokens_with_semantic_tags) if len(self.tokens_with_semantic_tags) != 0 else 0
+ E11 = (word_freq * coll_freq) / len(self.tokens_with_semantic_tags) if len(
+ self.tokens_with_semantic_tags) != 0 else 0
+ E12 = (word_freq * (len(self.tokens_with_semantic_tags) - coll_freq)) / \
+ len(self.tokens_with_semantic_tags) if len(
+ self.tokens_with_semantic_tags) != 0 else 0
+ E21 = ((len(self.tokens_with_semantic_tags) - word_freq) * coll_freq) / \
+ len(self.tokens_with_semantic_tags) if len(
+ self.tokens_with_semantic_tags) != 0 else 0
+ E22 = ((len(self.tokens_with_semantic_tags) - word_freq) * (len(self.tokens_with_semantic_tags) -
+ coll_freq)) / len(self.tokens_with_semantic_tags) if len(self.tokens_with_semantic_tags) != 0 else 0
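+        # O11..O22 and E11..E22 form the observed/expected 2x2 contingency table of
+        # keyword-collocate co-occurrence; below, MI = log2(O11/E11) and
+        # LL = 2 * sum(O_ij * ln(O_ij / E_ij)) over the four cells.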
# Mutual Information calculation
MI = math.log2(O11/E11) if O11 > 0 and E11 > 0 else 0
@@ -289,11 +311,12 @@ def _compute_mi_and_ll(self, word_freq, coll_freq, joint_freq):
LL_12 = O12 * math.log(O12/E12) if O12 > 0 and E12 != 0 else 0
LL_21 = O21 * math.log(O21/E21) if O21 > 0 and E21 != 0 else 0
LL_22 = O22 * math.log(O22/E22) if O22 > 0 and E22 != 0 else 0
-
+
LL2 = 2 * (LL_11 + LL_12 + LL_21 + LL_22)
LL2 = max(0, LL2)
- return round(MI,2), round(LL2,2)
+ return round(MI, 2), round(LL2, 2)
+
def get_collocation_strength(self, keyword, topn=30, window_size=5, by_tag=False, by_sem=False):
"""
Computes collocations around a keyword based on strength using MI and LL.
@@ -310,50 +333,54 @@ def get_collocation_strength(self, keyword, topn=30, window_size=5, by_tag=False
by MI and then LL in descending order.
"""
- # Get KWIC instances
- print('keyword',keyword)
- kwic_insts = self.get_kwic(keyword, window_size=window_size, max_instances=100000, by_tag=by_tag, by_sem=by_sem)
-
+ # Get KWIC instances
+ print('keyword', keyword)
+ kwic_insts = self.get_kwic(
+ keyword, window_size=window_size, max_instances=100000, by_tag=by_tag, by_sem=by_sem)
+
# Compute the collocates
collocs = self.get_collocs(kwic_insts)
-
# Compute the word and collocate frequencies
if by_sem:
- word_freq = sum(1 for token, pos, tag in self.tokens_with_semantic_tags if tag == keyword)
+ word_freq = sum(
+ 1 for token, pos, tag in self.tokens_with_semantic_tags if tag == keyword)
elif by_tag:
- word_freq = sum(1 for token, pos, tag in self.tokens_with_semantic_tags if pos == keyword)
+ word_freq = sum(
+ 1 for token, pos, tag in self.tokens_with_semantic_tags if pos == keyword)
else:
- word_freq = sum(1 for token, pos, tag in self.tokens_with_semantic_tags if token.lower() == keyword.lower())
-
+ word_freq = sum(
+ 1 for token, pos, tag in self.tokens_with_semantic_tags if token.lower() == keyword.lower())
+
# Compute MI and LL values for each collocate
results = []
for collocate, joint_freq in collocs:
- coll_freq = sum(1 for token,pos, tag in self.tokens_with_semantic_tags if token.lower() == collocate.lower())
+            coll_freq = sum(1 for token, pos, tag in self.tokens_with_semantic_tags
+                            if token.lower() == collocate.lower())
MI, LL2 = self._compute_mi_and_ll(word_freq, coll_freq, joint_freq)
results.append((collocate, MI, LL2))
-
# Sort results by MI and LL in descending order
return sorted(results, key=lambda x: (-x[1], -x[2]))
-
def tag_semantics(self, text):
"""
Function to tag the input text with semantic tags using the Pymsas API.
-
+
"""
- # Concatenate all texts with unique start and end tags
- concatenated_text = " [stt.] " + " [edd.] [stt.] ".join(self.text_rows) + " [edd.] "
- text= concatenated_text
+ # Concatenate all texts with unique start and end tags
+ concatenated_text = " [stt.] " + \
+ " [edd.] [stt.] ".join(self.text_rows) + " [edd.] "
+ text = concatenated_text
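+        # "[stt.]" and "[edd.]" are sentinel markers for the start/end of each input
+        # row; the resulting 'stt'/'edd' tokens are filtered out again downstream
+        # (e.g. in get_kwic and filter_words).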
# Detect the language of the text
- # Detect the language of the concatenated text
+ # Detect the language of the concatenated text
try:
lang_detected = detect(concatenated_text)
except:
lang_detected = 'en'
- tags_to_remove = [ 'Grammatical bin', 'Pronouns', 'Period', 'Being','Discourse Bin','Negative','PUNCT']
+ tags_to_remove = ['Grammatical bin', 'Pronouns',
+ 'Period', 'Being', 'Discourse Bin', 'Negative', 'PUNCT']
if lang_detected == 'cy':
text = text.strip().replace('‘', "'").replace('’', "'")
@@ -364,104 +391,117 @@ def tag_semantics(self, text):
'lang': (None, 'cy'),
'text': text,
}
- response = requests.post('http://ucrel-api-02.lancaster.ac.uk/cgi-bin/pymusas.pl', files=files)
-
+ response = requests.post(
+ 'http://ucrel-api-02.lancaster.ac.uk/cgi-bin/pymusas.pl', files=files)
+
# Read the response into a DataFrame
cy_tagged = pd.read_csv(io.StringIO(response.text), sep='\t')
-
- cy_tagged['USAS Tags'] = cy_tagged['USAS Tags'].str.split('[,/mf]').str[0].str.replace('[\[\]"\']', '', regex=True)
- cy_tagged['USAS Tags'] = cy_tagged['USAS Tags'].str.replace('([A-Za-z]+\d+).*', r'\1', regex=True) # Remove characters following the pattern 'letter + number'
-
- cy_tagged['USAS Tags'] = cy_tagged['USAS Tags'].str.split('+').str[0]
- sementic_tags = pd.read_csv('/freetxt/website/data/Welsh_pymusas_list.csv')
- merged_df = pd.merge(cy_tagged, sementic_tags, on='USAS Tags', how='left')
-
- merged_df.loc[merged_df['Equivalent_Tag'].notnull(), 'USAS Tags'] = merged_df['Equivalent_Tag']
- #merged_df = merged_df.drop(['Equivalent Tag'], axis=1)
- # merged_df = merged_df[~merged_df['USAS Tags'].isin(tags_to_remove)]
- merged_df = merged_df[merged_df['USAS Tags'].notnull()]
- merged_df['POS'] = merged_df['POS'].map(self.welsh_pos_mapping).fillna('unknown')
-
-
-
-
+            cy_tagged['USAS Tags'] = cy_tagged['USAS Tags'].str.split(
+                '[,/mf]').str[0].str.replace(r'[\[\]"\']', '', regex=True)
+            # Remove characters following the pattern 'letter + number'
+            cy_tagged['USAS Tags'] = cy_tagged['USAS Tags'].str.replace(
+                r'([A-Za-z]+\d+).*', r'\1', regex=True)
+
+ cy_tagged['USAS Tags'] = cy_tagged['USAS Tags'].str.split(
+ '+').str[0]
+            semantic_tags = pd.read_csv('website/data/Welsh_pymusas_list.csv')
+            merged_df = pd.merge(cy_tagged, semantic_tags,
+                                 on='USAS Tags', how='left')
+
+            merged_df.loc[merged_df['Equivalent_Tag'].notnull(),
+                          'USAS Tags'] = merged_df['Equivalent_Tag']
+ # merged_df = merged_df.drop(['Equivalent Tag'], axis=1)
+ # merged_df = merged_df[~merged_df['USAS Tags'].isin(tags_to_remove)]
+ merged_df = merged_df[merged_df['USAS Tags'].notnull()]
+ merged_df['POS'] = merged_df['POS'].map(
+ self.welsh_pos_mapping).fillna('unknown')
elif lang_detected == 'en':
text = text.translate(str.maketrans('', '', PUNCS))
files = {
- 'type': (None, 'rest'),
- 'email': (None, 'hello'),
- 'tagset': (None, 'c7'),
- 'style': (None, 'tab'),
- 'text': (None, text),
- }
+ 'type': (None, 'rest'),
+ 'email': (None, 'hello'),
+ 'tagset': (None, 'c7'),
+ 'style': (None, 'tab'),
+ 'text': (None, text),
+ }
- response = requests.post('http://ucrel-api-02.lancaster.ac.uk/cgi-bin/usas.pl', files=files)
+ response = requests.post(
+ 'http://ucrel-api-02.lancaster.ac.uk/cgi-bin/usas.pl', files=files)
# Column names
columns = ['Text', 'POS', 'Lemma', 'USAS Tags']
# Filter out the unwanted lines
- cleaned_lines = [line for line in response.text.splitlines() if line and not line.startswith('<')]
+            cleaned_lines = [line for line in response.text.splitlines()
+                             if line and not line.startswith('<')]
# Convert the list back to a string and read it into a DataFrame
cleaned_text = '\n'.join(cleaned_lines)
-
- en_tagged = pd.read_csv(io.StringIO(cleaned_text), sep='\t', names=columns, header=None)
- # Split the tags and keep only the first tag
- en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.split().str[0]
+ print("response from ucrel")
+ print(cleaned_text)
-
-
+ en_tagged = pd.read_csv(io.StringIO(
+ cleaned_text), sep='\t', names=columns, header=None)
- en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(r'\[.*', '', regex=True) # Remove everything after [
- en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(r'\/.*', '', regex=True) # Remove everything after /
- en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(r'(\d)[A-Za-z]+', r'\1', regex=True)
- en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(r'(\+|-)[A-Za-z+-]*', r'\1', regex=True)
+ # Split the tags and keep only the first tag
+ en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.split().str[0]
- merged_df = pd.merge(en_tagged, self.pymusaslist, on='USAS Tags', how='left')
-
-
-
- merged_df.loc[merged_df['Equivalent Tag'].notnull(), 'USAS Tags'] = merged_df['Equivalent Tag']
+ # Remove everything after [
+ en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(
+ r'\[.*', '', regex=True)
+ en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(
+ r'\/.*', '', regex=True) # Remove everything after /
+ en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(
+ r'(\d)[A-Za-z]+', r'\1', regex=True)
+ en_tagged['USAS Tags'] = en_tagged['USAS Tags'].str.replace(
+ r'(\+|-)[A-Za-z+-]*', r'\1', regex=True)
+
+ merged_df = pd.merge(en_tagged, self.pymusaslist,
+ on='USAS Tags', how='left')
+
+            merged_df.loc[merged_df['Equivalent Tag'].notnull(),
+                          'USAS Tags'] = merged_df['Equivalent Tag']
merged_df = merged_df.drop(['Equivalent Tag'], axis=1)
- #merged_df = merged_df[~merged_df['USAS Tags'].isin(tags_to_remove)]
+ # merged_df = merged_df[~merged_df['USAS Tags'].isin(tags_to_remove)]
merged_df = merged_df[merged_df['USAS Tags'].notnull()]
pos_tag_mapping = {}
for broader_category, pos_tags in self.claws_c7_mapping.items():
for pos_tag in pos_tags:
pos_tag_mapping[pos_tag] = broader_category
merged_df['POS'] = merged_df['POS'].replace(pos_tag_mapping)
-
- tokens_with_semantic_tags = [(row['Text'], row['POS'], row['USAS Tags']) for _, row in merged_df.iterrows()]
- print(tokens_with_semantic_tags)
-
+
+ tokens_with_semantic_tags = [
+ (row['Text'], row['POS'], row['USAS Tags']) for _, row in merged_df.iterrows()]
+
return tokens_with_semantic_tags
-
+
def update_graph(keyword, collocs, graph_type, output_file='network.html'):
- # Convert collocs to the required format for your graph function
- words, values = zip(*[(item['word'], item[graph_type]) for item in collocs])
-
+ # Convert collocs to the required format for your graph function
+ words, values = zip(*[(item['word'], item[graph_type])
+ for item in collocs])
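+        # `collocs` here is a list of dicts, each with a 'word' key and a numeric
+        # value under `graph_type`; that value sizes the nodes and weights the edges.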
+
# Creating a DataFrame
top_collocs_df = pd.DataFrame({
'word': words,
graph_type: values
})
-
+
# Inserting the 'source' column with the keyword
top_collocs_df.insert(1, 'source', keyword)
-
+
# Removing rows where keyword == word
top_collocs_df = top_collocs_df[top_collocs_df['word'] != keyword]
-
+
# Creating the network graph
- G = nx.from_pandas_edgelist(top_collocs_df, source='source', target='word', edge_attr=graph_type)
-
+ G = nx.from_pandas_edgelist(
+ top_collocs_df, source='source', target='word', edge_attr=graph_type)
+
n = max(values)
most_frequent_word = max(collocs, key=lambda x: x[graph_type])['word']
-
+
# Create the network graph
net = Network(notebook=True, height='750px', width='100%')
gravity = -200 * n / sum(values)
@@ -471,7 +511,8 @@ def update_graph(keyword, collocs, graph_type, output_file='network.html'):
fixed_main_node_size = 60
# Add the main node
- net.add_node(keyword, label=keyword, title=keyword, color='gray', size=fixed_main_node_size, font={'size': 32, 'face': 'Arial'})
+ net.add_node(keyword, label=keyword, title=keyword, color='gray',
+ size=fixed_main_node_size, font={'size': 32, 'face': 'Arial'})
# Adding nodes and edges in a single loop
for _, row in top_collocs_df.iterrows():
@@ -483,8 +524,10 @@ def update_graph(keyword, collocs, graph_type, output_file='network.html'):
else:
node_size = 30 * value / n
font_size = max(6, int(node_size / 2))
- net.add_node(node, label=node, title=node, color=node_color, size=node_size, font={'size': font_size, 'face': 'Arial'})
- net.add_edge(keyword, node, value=value, color='grey') # Set edge color to grey
+ net.add_node(node, label=node, title=node, color=node_color,
+ size=node_size, font={'size': font_size, 'face': 'Arial'})
+ # Set edge color to grey
+ net.add_edge(keyword, node, value=value, color='grey')
for source, target, value in top_collocs_df[['source', 'word', graph_type]].values:
if source in net.get_nodes() and target in net.get_nodes():
@@ -492,48 +535,54 @@ def update_graph(keyword, collocs, graph_type, output_file='network.html'):
timestamp = int(time.time())
graph_folder = "/freetxt/website/static/network_graphs"
filename = f"network_{timestamp}.html"
-
+
graph_path = os.path.join(graph_folder, filename)
-
original_working_dir = os.getcwd()
os.chdir(graph_folder)
-
-
+
net.save_graph(graph_path)
# Print the path of the saved graph in the temp directory for debugging
- #print(graph_path)
+ # print(graph_path)
# Revert to the original working directory
os.chdir(original_working_dir)
# Return the relative path to the saved graph
return f"/static/network_graphs/{filename}"
+
def get_sorted_unique_tags(self):
- # Assuming self.tokens_with_semantic_tags is a list of tuples (token, semantic_tag)
- semantic_tags = [tag for token,pos, tag in self.tokens_with_semantic_tags]
+        # self.tokens_with_semantic_tags is a list of (token, pos, semantic_tag) tuples
+        semantic_tags = [tag for token, pos,
+                         tag in self.tokens_with_semantic_tags]
sorted_unique_tags = sorted(set(map(str, semantic_tags)))
-
+
return sorted_unique_tags
def filter_words(self, word_tag_pairs):
"""Return a list with stopwords, punctuation, and specific tags removed."""
-
- specific_tokens_to_remove = {'stt', 'edd'} # Add any specific tokens to remove here
- tags_to_remove = {'Grammatical bin', 'Pronouns', 'Period', 'Being', 'Discourse Bin', 'Negative', 'PUNCT', 'Unmatched'} # Add any specific tags to remove here
+
+ # Add any specific tokens to remove here
+ specific_tokens_to_remove = {'stt', 'edd'}
+ tags_to_remove = {'Grammatical bin', 'Pronouns', 'Period', 'Being', 'Discourse Bin',
+ 'Negative', 'PUNCT', 'Unmatched'} # Add any specific tags to remove here
return [word for word, tag in word_tag_pairs if str(word) not in STOPWORDS and str(word) not in PUNCS and str(word) not in specific_tokens_to_remove and tag not in tags_to_remove]
- def get_word_frequencies(self):
+ def get_word_frequencies(self, isUnfiltered=False):
# Extract tokens and their tags from the tuples
- token_tag_pairs = [(token, tag) for token, pos, tag in self.tokens_with_semantic_tags]
-
- # Apply the filter to remove stopwords, punctuation, and specific tags
- filtered_tokens = self.filter_words(token_tag_pairs)
+ token_tag_pairs = [(token, tag)
+ for token, pos, tag in self.tokens_with_semantic_tags]
+ if isUnfiltered:
+ # Leave unfiltered
+ tokens = [str(word) for word, tag in token_tag_pairs]
+ else:
+ # Apply the filter to remove stopwords, punctuation, and specific tags
+ tokens = self.filter_words(token_tag_pairs)
# Calculate word frequencies using nltk.FreqDist
- fdist = nltk.FreqDist(filtered_tokens)
+ fdist = nltk.FreqDist(tokens)
- # Convert the frequency distribution to a dictionary
- word_frequencies = dict(fdist)
+ # Convert the frequency distribution to a dictionary, keys typed to strings to avoid comparison errors
+ word_frequencies = {str(word): freq for word, freq in fdist.items()}
return word_frequencies
diff --git a/lib/bindings/utils.js b/lib/bindings/utils.js
new file mode 100644
index 0000000..088effe
--- /dev/null
+++ b/lib/bindings/utils.js
@@ -0,0 +1,189 @@
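+// Helper functions for the interactive network-graph pages. They operate on the
+// globals (nodes, edges, network, nodeColors, highlightActive, filterActive)
+// defined in the HTML page, generated by pyvis/vis-network, that loads this file.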
+function neighbourhoodHighlight(params) {
+ // console.log("in nieghbourhoodhighlight");
+ allNodes = nodes.get({ returnType: "Object" });
+ // originalNodes = JSON.parse(JSON.stringify(allNodes));
+ // if something is selected:
+ if (params.nodes.length > 0) {
+ highlightActive = true;
+ var i, j;
+ var selectedNode = params.nodes[0];
+ var degrees = 2;
+
+ // mark all nodes as hard to read.
+ for (let nodeId in allNodes) {
+ // nodeColors[nodeId] = allNodes[nodeId].color;
+ allNodes[nodeId].color = "rgba(200,200,200,0.5)";
+ if (allNodes[nodeId].hiddenLabel === undefined) {
+ allNodes[nodeId].hiddenLabel = allNodes[nodeId].label;
+ allNodes[nodeId].label = undefined;
+ }
+ }
+ var connectedNodes = network.getConnectedNodes(selectedNode);
+ var allConnectedNodes = [];
+
+ // get the second degree nodes
+ for (i = 1; i < degrees; i++) {
+ for (j = 0; j < connectedNodes.length; j++) {
+ allConnectedNodes = allConnectedNodes.concat(
+ network.getConnectedNodes(connectedNodes[j])
+ );
+ }
+ }
+
+ // all second degree nodes get a different color and their label back
+ for (i = 0; i < allConnectedNodes.length; i++) {
+ // allNodes[allConnectedNodes[i]].color = "pink";
+ allNodes[allConnectedNodes[i]].color = "rgba(150,150,150,0.75)";
+ if (allNodes[allConnectedNodes[i]].hiddenLabel !== undefined) {
+ allNodes[allConnectedNodes[i]].label =
+ allNodes[allConnectedNodes[i]].hiddenLabel;
+ allNodes[allConnectedNodes[i]].hiddenLabel = undefined;
+ }
+ }
+
+ // all first degree nodes get their own color and their label back
+ for (i = 0; i < connectedNodes.length; i++) {
+ // allNodes[connectedNodes[i]].color = undefined;
+ allNodes[connectedNodes[i]].color = nodeColors[connectedNodes[i]];
+ if (allNodes[connectedNodes[i]].hiddenLabel !== undefined) {
+ allNodes[connectedNodes[i]].label =
+ allNodes[connectedNodes[i]].hiddenLabel;
+ allNodes[connectedNodes[i]].hiddenLabel = undefined;
+ }
+ }
+
+ // the main node gets its own color and its label back.
+ // allNodes[selectedNode].color = undefined;
+ allNodes[selectedNode].color = nodeColors[selectedNode];
+ if (allNodes[selectedNode].hiddenLabel !== undefined) {
+ allNodes[selectedNode].label = allNodes[selectedNode].hiddenLabel;
+ allNodes[selectedNode].hiddenLabel = undefined;
+ }
+ } else if (highlightActive === true) {
+ // console.log("highlightActive was true");
+ // reset all nodes
+ for (let nodeId in allNodes) {
+ // allNodes[nodeId].color = "purple";
+ allNodes[nodeId].color = nodeColors[nodeId];
+ // delete allNodes[nodeId].color;
+ if (allNodes[nodeId].hiddenLabel !== undefined) {
+ allNodes[nodeId].label = allNodes[nodeId].hiddenLabel;
+ allNodes[nodeId].hiddenLabel = undefined;
+ }
+ }
+ highlightActive = false;
+ }
+
+ // transform the object into an array
+ var updateArray = [];
+ if (params.nodes.length > 0) {
+ for (let nodeId in allNodes) {
+ if (allNodes.hasOwnProperty(nodeId)) {
+ // console.log(allNodes[nodeId]);
+ updateArray.push(allNodes[nodeId]);
+ }
+ }
+ nodes.update(updateArray);
+ } else {
+ // console.log("Nothing was selected");
+ for (let nodeId in allNodes) {
+ if (allNodes.hasOwnProperty(nodeId)) {
+ // console.log(allNodes[nodeId]);
+ // allNodes[nodeId].color = {};
+ updateArray.push(allNodes[nodeId]);
+ }
+ }
+ nodes.update(updateArray);
+ }
+}
+
+function filterHighlight(params) {
+ allNodes = nodes.get({ returnType: "Object" });
+ // if something is selected:
+ if (params.nodes.length > 0) {
+ filterActive = true;
+ let selectedNodes = params.nodes;
+
+ // hiding all nodes and saving the label
+ for (let nodeId in allNodes) {
+ allNodes[nodeId].hidden = true;
+ if (allNodes[nodeId].savedLabel === undefined) {
+ allNodes[nodeId].savedLabel = allNodes[nodeId].label;
+ allNodes[nodeId].label = undefined;
+ }
+ }
+
+ for (let i=0; i < selectedNodes.length; i++) {
+ allNodes[selectedNodes[i]].hidden = false;
+ if (allNodes[selectedNodes[i]].savedLabel !== undefined) {
+ allNodes[selectedNodes[i]].label = allNodes[selectedNodes[i]].savedLabel;
+ allNodes[selectedNodes[i]].savedLabel = undefined;
+ }
+ }
+
+ } else if (filterActive === true) {
+ // reset all nodes
+ for (let nodeId in allNodes) {
+ allNodes[nodeId].hidden = false;
+ if (allNodes[nodeId].savedLabel !== undefined) {
+ allNodes[nodeId].label = allNodes[nodeId].savedLabel;
+ allNodes[nodeId].savedLabel = undefined;
+ }
+ }
+ filterActive = false;
+ }
+
+ // transform the object into an array
+ var updateArray = [];
+ if (params.nodes.length > 0) {
+ for (let nodeId in allNodes) {
+ if (allNodes.hasOwnProperty(nodeId)) {
+ updateArray.push(allNodes[nodeId]);
+ }
+ }
+ nodes.update(updateArray);
+ } else {
+ for (let nodeId in allNodes) {
+ if (allNodes.hasOwnProperty(nodeId)) {
+ updateArray.push(allNodes[nodeId]);
+ }
+ }
+ nodes.update(updateArray);
+ }
+}
+
+function selectNode(nodes) {
+ network.selectNodes(nodes);
+ neighbourhoodHighlight({ nodes: nodes });
+ return nodes;
+}
+
+function selectNodes(nodes) {
+ network.selectNodes(nodes);
+ filterHighlight({nodes: nodes});
+ return nodes;
+}
+
+function highlightFilter(filter) {
+ let selectedNodes = []
+ let selectedProp = filter['property']
+ if (filter['item'] === 'node') {
+ let allNodes = nodes.get({ returnType: "Object" });
+ for (let nodeId in allNodes) {
+ if (allNodes[nodeId][selectedProp] && filter['value'].includes((allNodes[nodeId][selectedProp]).toString())) {
+ selectedNodes.push(nodeId)
+ }
+ }
+ }
+ else if (filter['item'] === 'edge'){
+ let allEdges = edges.get({returnType: 'object'});
+ // check if the selected property exists for selected edge and select the nodes connected to the edge
+ for (let edge in allEdges) {
+ if (allEdges[edge][selectedProp] && filter['value'].includes((allEdges[edge][selectedProp]).toString())) {
+ selectedNodes.push(allEdges[edge]['from'])
+ selectedNodes.push(allEdges[edge]['to'])
+ }
+ }
+ }
+ selectNodes(selectedNodes)
+}
\ No newline at end of file
diff --git a/lib/tom-select/tom-select.complete.min.js b/lib/tom-select/tom-select.complete.min.js
new file mode 100644
index 0000000..e2e0211
--- /dev/null
+++ b/lib/tom-select/tom-select.complete.min.js
@@ -0,0 +1,356 @@
+/**
+* Tom Select v2.0.0-rc.4
+* Licensed under the Apache License, Version 2.0 (the "License");
+*/
+!function(e,t){"object"==typeof exports&&"undefined"!=typeof module?module.exports=t():"function"==typeof define&&define.amd?define(t):(e="undefined"!=typeof globalThis?globalThis:e||self).TomSelect=t()}(this,(function(){"use strict"
+function e(e,t){e.split(/\s+/).forEach((e=>{t(e)}))}class t{constructor(){this._events={}}on(t,i){e(t,(e=>{this._events[e]=this._events[e]||[],this._events[e].push(i)}))}off(t,i){var s=arguments.length
+0!==s?e(t,(e=>{if(1===s)return delete this._events[e]
+e in this._events!=!1&&this._events[e].splice(this._events[e].indexOf(i),1)})):this._events={}}trigger(t,...i){var s=this
+e(t,(e=>{if(e in s._events!=!1)for(let t of s._events[e])t.apply(s,i)}))}}var i
+const s="[̀-ͯ·ʾ]",n=new RegExp(s,"g")
+var o
+const r={"æ":"ae","ⱥ":"a","ø":"o"},l=new RegExp(Object.keys(r).join("|"),"g"),a=[[67,67],[160,160],[192,438],[452,652],[961,961],[1019,1019],[1083,1083],[1281,1289],[1984,1984],[5095,5095],[7429,7441],[7545,7549],[7680,7935],[8580,8580],[9398,9449],[11360,11391],[42792,42793],[42802,42851],[42873,42897],[42912,42922],[64256,64260],[65313,65338],[65345,65370]],c=e=>e.normalize("NFKD").replace(n,"").toLowerCase().replace(l,(function(e){return r[e]})),d=(e,t="|")=>{if(1==e.length)return e[0]
+var i=1
+return e.forEach((e=>{i=Math.max(i,e.length)})),1==i?"["+e.join("")+"]":"(?:"+e.join(t)+")"},p=e=>{if(1===e.length)return[[e]]
+var t=[]
+return p(e.substring(1)).forEach((function(i){var s=i.slice(0)
+s[0]=e.charAt(0)+s[0],t.push(s),(s=i.slice(0)).unshift(e.charAt(0)),t.push(s)})),t},u=e=>{void 0===o&&(o=(()=>{var e={}
+a.forEach((t=>{for(let s=t[0];s<=t[1];s++){let t=String.fromCharCode(s),n=c(t)
+if(n!=t.toLowerCase()){n in e||(e[n]=[n])
+var i=new RegExp(d(e[n]),"iu")
+t.match(i)||e[n].push(t)}}}))
+var t=Object.keys(e)
+t=t.sort(((e,t)=>t.length-e.length)),i=new RegExp("("+d(t)+"[̀-ͯ·ʾ]*)","g")
+var s={}
+return t.sort(((e,t)=>e.length-t.length)).forEach((t=>{var i=p(t).map((t=>(t=t.map((t=>e.hasOwnProperty(t)?d(e[t]):t)),d(t,""))))
+s[t]=d(i)})),s})())
+return e.normalize("NFKD").toLowerCase().split(i).map((e=>{if(""==e)return""
+const t=c(e)
+if(o.hasOwnProperty(t))return o[t]
+const i=e.normalize("NFC")
+return i!=e?d([e,i]):e})).join("")},h=(e,t)=>{if(e)return e[t]},g=(e,t)=>{if(e){for(var i,s=t.split(".");(i=s.shift())&&(e=e[i]););return e}},f=(e,t,i)=>{var s,n
+return e?-1===(n=(e+="").search(t.regex))?0:(s=t.string.length/e.length,0===n&&(s+=.5),s*i):0},v=e=>(e+"").replace(/([\$\(-\+\.\?\[-\^\{-\}])/g,"\\$1"),m=(e,t)=>{var i=e[t]
+if("function"==typeof i)return i
+i&&!Array.isArray(i)&&(e[t]=[i])},y=(e,t)=>{if(Array.isArray(e))e.forEach(t)
+else for(var i in e)e.hasOwnProperty(i)&&t(e[i],i)},O=(e,t)=>"number"==typeof e&&"number"==typeof t?e>t?1:e
+
+