Adding initial ABSA implementation
james-hulud committed Aug 18, 2024
2 parents 5886301 + 9a286aa commit d1b5b6f
Showing 16 changed files with 659 additions and 121 deletions.
3 changes: 3 additions & 0 deletions Keyword_collocation.py
@@ -440,6 +440,9 @@ def tag_semantics(self, text):
# Convert the list back to a string and read it into a DataFrame
cleaned_text = '\n'.join(cleaned_lines)

print("response from ucrel")
print(cleaned_text)

en_tagged = pd.read_csv(io.StringIO(
cleaned_text), sep='\t', names=columns, header=None)

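For context, a minimal sketch of the DataFrame step in this hunk. The real columns list is defined earlier in Keyword_collocation.py (not shown here), so the column names and tagged text below are hypothetical stand-ins:

import io
import pandas as pd

# Hypothetical tab-separated tagged output; the real cleaned_text comes from the UCREL API response
cleaned_text = "The\tDET\tZ5\nprice\tNOUN\tI1.1"
columns = ["token", "pos", "semtag"]  # illustrative stand-in for the list defined earlier in the file
en_tagged = pd.read_csv(io.StringIO(cleaned_text), sep="\t", names=columns, header=None)
print(en_tagged)  # one row per token, one column per tag field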
Binary file modified __pycache__/Keyword_collocation.cpython-312.pyc
Binary file not shown.
Binary file modified __pycache__/sentiment_analyser.cpython-312.pyc
Binary file not shown.
Binary file modified __pycache__/word_cloud_generator.cpython-312.pyc
Binary file not shown.
81 changes: 80 additions & 1 deletion sentiment_analyser.py
@@ -8,6 +8,8 @@
import time
import scattertext as st
import spacy
from pyabsa import AspectPolarityClassification as APC, available_checkpoints

nlp = spacy.load('en_core_web_sm-3.2.0') # Load the spaCy model
nlp.max_length = 9000000

@@ -141,9 +143,86 @@ def analyse_sentiment(self, input_text, language, num_classes, max_seq_len=512):
sentiments.append(
(original_review, sentiment_label, sentiment_score))
sentiment_counts[sentiment_label] += 1
# print(sentiment_counts)
return sentiments, sentiment_counts

def find_aspects(self, rows, aspects, includeGlobalSentiments=False):
"""
Searches the text rows for the given aspects and marks them, ready for analysis.
Parameters:
rows (list[str]): The text rows to be searched for aspects.
aspects (list[str]): The aspects to find in the provided rows.
includeGlobalSentiments (bool): If True, rows that contain none of the entered aspects ("global sentiments") are also included.
Returns:
list[str]: The updated rows with each targeted aspect wrapped in [B-ASP] ... [E-ASP] markers, e.g. [B-ASP]price[E-ASP].
"""

# Removes any trailing or leading whitespace, converts to lower case
aspects = [aspect.lower().strip() for aspect in aspects]

# Sort aspects by length in descending order to prioritize longer matches
aspects.sort(key=len, reverse=True)

# Pattern to match aspect as a whole word
pattern = r'\b{}\b'

if includeGlobalSentiments:
rows = [row.strip() for row in rows]
else:
# Filters out any rows that do not have any of the entered aspects
rows = [row.strip() for row in rows if any(
re.search(pattern.format(re.escape(aspect)), row.lower()) for aspect in aspects)]

modified_rows = []

for row in rows:
for aspect in aspects:
# Check if the aspect is not already within [B-ASP]...[E-ASP]
escaped_aspect = re.escape(aspect)
if not re.search(rf'\[B-ASP\].*?{escaped_aspect}.*?\[E-ASP\]', row, re.IGNORECASE):
# Mark the aspect in the row
row = re.sub(pattern.format(escaped_aspect),
r'[B-ASP]\g<0>[E-ASP]', row, flags=re.IGNORECASE)

modified_rows.append(row)

return modified_rows
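
A minimal usage sketch of find_aspects with hypothetical inputs, assuming it is called on a SentimentAnalyser instance:

analyser = SentimentAnalyser()
rows = ["The battery life is great, but the screen scratches easily."]
marked = analyser.find_aspects(rows, aspects=["battery life", "screen"])
# Each matched aspect is wrapped in markers, preserving its original case:
# "The [B-ASP]battery life[E-ASP] is great, but the [B-ASP]screen[E-ASP] scratches easily."

Sorting the aspects longest-first ensures a multi-word aspect such as "battery life" is marked as one span before any shorter aspect that overlaps it.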

# Aspect-Based Sentiment Analysis
def analyse_aspects_sentiment(self, rows, aspects, includeGlobalSentiments=False):
ckpts = available_checkpoints()  # query the checkpoints PyABSA has available (value currently unused)
sentiment_classifier = APC.SentimentClassifier(
checkpoint="english"
)

if includeGlobalSentiments:
rows = self.find_aspects(
rows, aspects, True) if rows else []
else:
rows = self.find_aspects(
rows, aspects) if rows else []

if len(rows) < 1:
# Returned rather than raised so the caller can check isinstance(result, Exception)
return Exception("Error, no data to analyse")

results = []

for row in rows:
sentiment_result = sentiment_classifier.predict(
text=row,
ignore_error=True,
eval_batch_size=32,
)

# Convert numpy arrays to Python lists for JSON serialisation
sentiment_result["probs"] = [np_arr.tolist()
for np_arr in sentiment_result["probs"]]

results.append(sentiment_result)

return results
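
For reference, a sketch of the shape a caller receives. The keys (text, aspect, sentiment, confidence, probs) are the ones this commit reads downstream in File_analysis.py; the sample values are illustrative, not real model output:

analyser = SentimentAnalyser()
results = analyser.analyse_aspects_sentiment(
    rows=["The battery lasts all day."], aspects=["battery"])
# results might look like:
# [{"text": "The battery lasts all day.",
#   "aspect": ["battery"],
#   "sentiment": ["Positive"],
#   "confidence": [0.97],
#   "probs": [[0.01, 0.02, 0.97]]}]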

def generate_scattertext_visualization(self, dfanalysis, language):
# Get the DataFrame with sentiment analysis results
df = dfanalysis
165 changes: 136 additions & 29 deletions website/File_analysis.py
@@ -27,10 +27,10 @@
import threading
import json
import humanize

import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

en_stopwords = list(stopwords.words('english'))
cy_stopwords = open('website/data/welsh_stopwords.txt', 'r',
# replaced 'utf8' with 'iso-8859-1'
@@ -168,13 +168,6 @@ def fileanalysis():
elif input_method == 'example':
example_file = request.form.get('example-data')

print()
print()
print("example file")
print(example_file)
print()
print()

if not example_file:
# Handle the error - maybe raise an exception or return an error response.
current_app.logger.error(
@@ -186,12 +179,6 @@
file_extension = os.path.splitext(
file_path)[1].lower() # Extract the file extension

print()
print()
print("file path and ext")
print(file_path)
print(file_extension)

# Differentiate the behavior based on the file extension
if file_extension == '.txt':
with open(file_path, 'r', encoding='utf-8') as f:
@@ -441,6 +428,8 @@ def sentiment_analysis(sentences, language, sentiment_classes=3):
return data, sentiment_counts, plot_html_pie, plot_html_bar
return None, None, None

#! Update for ABSA


@FileAnalysis.route('/update_sentiment', methods=['POST'])
def handle_sentiment_update():
@@ -546,25 +535,25 @@ def handle_selected_rows():
random_word = random.choice(list(sorted_word_frequencies.keys()))
search_word = random_word
# Initialize the KWICAnalyser with the merged rows

# ucrel api used
analyser = KWICAnalyser(' '.join(merged_rows), language='en')
# Adding to session for word cloud to use
session["tokens_with_semantic_tags"] = analyser.tokens_with_semantic_tags

# Get the sorted unique list of semantic tags
# ucrel api used
sorted_unique_tags = analyser.get_sorted_unique_tags()
word_frequencies = analyser.get_word_frequencies()

unfiltered_word_frequencies = analyser.get_word_frequencies(
isUnfiltered=True)
session['unfiltered_word_frequencies'] = unfiltered_word_frequencies
session['word_frequencies'] = word_frequencies

session['mergedData'] = merged_rows
session['sentiment_data'] = sentiment_data

summary = summarize_text(merged_rows)

current_app.logger.info("/process_rows completed")

return jsonify({
"status": "success",
"wordFrequencies": word_frequencies,
@@ -1139,17 +1128,6 @@ def regenerate_wordcloud():
tag_words_associations = {tag: list({word for (word, pos, tag_entry) in words_tags if tag == tag_entry}) for (
word, pos, tag) in words_tags if tag in word_list}

print()
print()
print("tag and word associations")
print(tag_words_associations)

print("word lists")
print(word_list)
print(sec_word_list)
print()
print()

session['sec_word_cloud_src'] = sec_wc_path

json_data = {
@@ -1207,3 +1185,132 @@ def get_collos_data():
except Exception as e:

return "Server encountered an error", 500


@FileAnalysis.route('/aspect-based-analysis', methods=['POST'])
def aspect_based_sentiment_analysis():
data = request.get_json()
rows_data = data.get("rows", [])
aspects_data = data.get("aspects", [])
global_sentiments_data = data.get("includeGlobalSentiments", False)
language = data.get("language", "en")

analyser = SentimentAnalyser()

# Analyses aspect sentiment for each row in rows_data
try:
if not rows_data:
return jsonify({"status": "error", "message": "No rows data provided. Please provide text data to analyze."}), 400

if not aspects_data:
return jsonify({"status": "error", "message": "No aspects provided. Please provide aspects to analyse in the text."}), 400

if len(aspects_data) > 10:
return jsonify({"status": "error", "message": f"Too many aspects provided. You can include a maximum of 10 aspects. Current count: {len(aspects_data)}."}), 400

char_count = 0
for aspect in aspects_data:
char_count += len(aspect)

if char_count > 500:
return jsonify({"status": "error", "message": f"Too many characters provided. Your total aspect input must be less than 500 characters. Current count: {char_count}."}), 400

results = analyser.analyse_aspects_sentiment(
rows=rows_data, aspects=aspects_data, includeGlobalSentiments=global_sentiments_data)

if isinstance(results, Exception):
return jsonify({"status": "error", "message": f"No data to analyse for entered aspect(s).\n\nAspects: {', '.join(aspects_data)}"}), 400

except Exception as e:
current_app.logger.exception(f"Error: {e}")
return jsonify({"status": "error", "message": f"Error: {e}. Please try again."}), 500

sentiment_data = [{"Review": result["text"], "Aspect": aspect, "Sentiment Label": result["sentiment"][idx],
"Confidence Score": round(result["confidence"][idx], 2)} for result in results for idx, aspect in enumerate(result["aspect"])]

# Count sentiment results for pie chart; keys are lower-cased to match the
# case-insensitive aspect matching in find_aspects
aspect_sentiment_counter = {aspect.lower().strip(): {
"Positive": 0, "Neutral": 0, "Negative": 0} for aspect in aspects_data}

for entry in results:
for idx, asp in enumerate(entry["aspect"]):
# Ignores case
asp = asp.lower()

if asp in aspect_sentiment_counter:
aspect_sentiment_counter[asp][entry["sentiment"][idx]] += 1

# Remove previous absa plots
remove_previous_plots(
"website/static/Sentiment_plots", "sentiment_pie_absa_")

color_map = {
"Very negative": "#ff3333",
"Negative": "#ff8a3d",
"Neutral": "#b0b0b0",
"Positive": "#c5e17a",
"Very positive": "#6ebd45",
"Negyddol Iawn": "#ff3333",
"Negyddol": "#ff8a3d",
"Niwtral": "#b0b0b0",
"Cadarnhaol": "#c5e17a",
"Cadarnhaol Iawn": "#6ebd45"
}

# Sort aspects by their total number of sentiment occurrences, highest first
aspect_sentiment_counter = dict(sorted(aspect_sentiment_counter.items(
), key=lambda item: item[1]["Positive"] + item[1]["Neutral"] + item[1]["Negative"], reverse=True))

html_plots = []
plot_title = "Sentiment Distribution for: " if language == "en" else "Dosbarthiad Sentiment ar gyfer: "

for aspect, dict_val in aspect_sentiment_counter.items():
# If aspect is not in dataset, do not generate pie chart
if all(val == 0 for val in dict_val.values()):
continue

fig = px.pie(values=dict_val.values(), names=dict_val.keys(),
title=f"{plot_title}{aspect}", color=dict_val.keys(),
color_discrete_map=color_map)

plot_html_pie = fig.to_html(full_html=False)

fig.write_image(
f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.png")
fig.write_html(
f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.html")
with open(f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.html", "r", encoding="utf-8") as f:
content = f.read()

# Add the "Visualisation by" text and logo image at the bottom
addition = """
<div style="text-align:center; margin-top:30px;">
Visualisation by <img src="https://ucrel-freetxt-2.lancs.ac.uk/static/images/logo.png" alt="Logo" style="height:40px;">
</div>
"""

# Append the new content just before the closing body tag
content = content.replace("</body>", addition + "\n</body>")

with open(f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.html", "w", encoding="utf-8") as f:
f.write(content)

# Tuple containing the plot and the total number of occurrences
html_plots.append((plot_html_pie, sum(dict_val.values())))

return jsonify({
"status": "success",
"plots": html_plots,
"sentimentData": sentiment_data
})
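
A sketch of how a client could call the new route. The JSON fields match what the handler reads above; the host, port, and any blueprint URL prefix are assumptions about the deployment:

import requests

resp = requests.post(
    "http://localhost:5000/aspect-based-analysis",  # prefix depends on how the FileAnalysis blueprint is registered
    json={
        "rows": ["The battery lasts all day.", "The screen is too dim."],
        "aspects": ["battery", "screen"],  # at most 10 aspects, 500 characters in total
        "includeGlobalSentiments": False,
        "language": "en",
    },
)
data = resp.json()
print(data["sentimentData"])  # per-row aspect sentiments
# data["plots"] holds (pie-chart HTML, total occurrence count) tuples per aspect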


def remove_previous_plots(directory, substring):
for filename in os.listdir(directory):
if substring in filename:
file_path = os.path.join(directory, filename)
try:
os.remove(file_path)
# print("Removed file:", file_path)
except Exception as e:
current_app.logger.exception("Error removing old plots: %s", e)
Binary file modified website/__pycache__/File_analysis.cpython-312.pyc
Binary file not shown.
Binary file modified website/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion website/static/Sentiment_plots/sentiment_bar.html

Large diffs are not rendered by default.

Binary file added website/static/Sentiment_plots/sentiment_bar.png
2 changes: 1 addition & 1 deletion website/static/Sentiment_plots/sentiment_pie.html

Large diffs are not rendered by default.

Binary file modified website/static/Sentiment_plots/sentiment_pie.png
9 changes: 9 additions & 0 deletions website/static/css/style-file-analysis.css
@@ -46,6 +46,15 @@
resize: vertical;
}

#absa-aspects-to-analyze {
height: 40px;
padding: 10px;
font-size: 16px;
border: 1px solid #ccc;
border-radius: 5px;
resize: vertical;
}

.analyze-btn {
display: flex;
align-items: center;
2 changes: 2 additions & 0 deletions website/static/css/style.css
@@ -583,6 +583,8 @@ samp {
}

pre {
font-family: -apple-system, BlinkMacSystemFont, "Helvetica Neue", Helvetica,
sans-serif !important;
border: 0.1rem solid #dcd7ca;
line-height: 1.5;
margin: 4rem 0;