Adding initial ABSA implementation
james-hulud committed Aug 18, 2024
2 parents 5886301 + 9a286aa commit d1b5b6f
Showing 16 changed files with 659 additions and 121 deletions.
3 changes: 3 additions & 0 deletions Keyword_collocation.py
@@ -440,6 +440,9 @@ def tag_semantics(self, text):
# Convert the list back to a string and read it into a DataFrame
cleaned_text = '\n'.join(cleaned_lines)

print("response from ucrel")
print(cleaned_text)

en_tagged = pd.read_csv(io.StringIO(
cleaned_text), sep='\t', names=columns, header=None)

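For context, a minimal sketch of the DataFrame step in this hunk. The real columns list is defined earlier in Keyword_collocation.py (not shown here), so the column names and tagged text below are hypothetical stand-ins:

import io
import pandas as pd

# Hypothetical tab-separated tagged output; the real cleaned_text comes from the UCREL API response
cleaned_text = "The\tDET\tZ5\nprice\tNOUN\tI1.1"
columns = ["token", "pos", "semtag"]  # illustrative stand-in for the list defined earlier in the file
en_tagged = pd.read_csv(io.StringIO(cleaned_text), sep="\t", names=columns, header=None)
print(en_tagged)  # one row per token, one column per tag field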
Binary file modified __pycache__/Keyword_collocation.cpython-312.pyc
Binary file not shown.
Binary file modified __pycache__/sentiment_analyser.cpython-312.pyc
Binary file not shown.
Binary file modified __pycache__/word_cloud_generator.cpython-312.pyc
Binary file not shown.
81 changes: 80 additions & 1 deletion sentiment_analyser.py
@@ -8,6 +8,8 @@
import time
import scattertext as st
import spacy
from pyabsa import AspectPolarityClassification as APC, available_checkpoints

nlp = spacy.load('en_core_web_sm-3.2.0') # Load the spaCy model
nlp.max_length = 9000000

@@ -141,9 +143,86 @@ def analyse_sentiment(self, input_text, language, num_classes, max_seq_len=512):
sentiments.append(
(original_review, sentiment_label, sentiment_score))
sentiment_counts[sentiment_label] += 1
# print(sentiment_counts)
return sentiments, sentiment_counts

def find_aspects(self, rows, aspects, includeGlobalSentiments=False):
"""
Searches the text rows for the given aspects and marks them, ready for analysis.
Parameters:
rows (list[str]): The text rows to be searched for aspects.
aspects (list[str]): The aspects to find in the provided rows.
includeGlobalSentiments (bool): If True, rows that contain none of the entered aspects ("global sentiments") are also included.
Returns:
list[str]: The updated rows with each targeted aspect wrapped in [B-ASP] ... [E-ASP] markers, e.g. [B-ASP]price[E-ASP].
"""

# Removes any trailing or leading whitespace, converts to lower case
aspects = [aspect.lower().strip() for aspect in aspects]

# Sort aspects by length in descending order to prioritize longer matches
aspects.sort(key=len, reverse=True)

# Pattern to match aspect as a whole word
pattern = r'\b{}\b'

if includeGlobalSentiments:
rows = [row.strip() for row in rows]
else:
# Filters out any rows that do not have any of the entered aspects
rows = [row.strip() for row in rows if any(
re.search(pattern.format(re.escape(aspect)), row.lower()) for aspect in aspects)]

modified_rows = []

for row in rows:
for aspect in aspects:
# Check if the aspect is not already within [B-ASP]...[E-ASP]
escaped_aspect = re.escape(aspect)
if not re.search(rf'\[B-ASP\].*?{escaped_aspect}.*?\[E-ASP\]', row, re.IGNORECASE):
# Mark the aspect in the row
row = re.sub(pattern.format(escaped_aspect),
r'[B-ASP]\g<0>[E-ASP]', row, flags=re.IGNORECASE)

modified_rows.append(row)

return modified_rows
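
A minimal usage sketch of find_aspects with hypothetical inputs, assuming it is called on a SentimentAnalyser instance:

analyser = SentimentAnalyser()
rows = ["The battery life is great, but the screen scratches easily."]
marked = analyser.find_aspects(rows, aspects=["battery life", "screen"])
# Each matched aspect is wrapped in markers, preserving its original case:
# "The [B-ASP]battery life[E-ASP] is great, but the [B-ASP]screen[E-ASP] scratches easily."

Sorting the aspects longest-first ensures a multi-word aspect such as "battery life" is marked as one span before any shorter aspect that overlaps it.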

# Aspect-Based Sentiment Analysis
def analyse_aspects_sentiment(self, rows, aspects, includeGlobalSentiments=False):
ckpts = available_checkpoints()  # query the checkpoints PyABSA has available (value currently unused)
sentiment_classifier = APC.SentimentClassifier(
checkpoint="english"
)

if includeGlobalSentiments:
rows = self.find_aspects(
rows, aspects, True) if rows else []
else:
rows = self.find_aspects(
rows, aspects) if rows else []

if len(rows) < 1:
# Returned rather than raised so the caller can check isinstance(result, Exception)
return Exception("Error, no data to analyse")

results = []

for row in rows:
sentiment_result = sentiment_classifier.predict(
text=row,
ignore_error=True,
eval_batch_size=32,
)

# Convert numpy arrays to Python lists for JSON serialisation
sentiment_result["probs"] = [np_arr.tolist()
for np_arr in sentiment_result["probs"]]

results.append(sentiment_result)

return results
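
For reference, a sketch of the shape a caller receives. The keys (text, aspect, sentiment, confidence, probs) are the ones this commit reads downstream in File_analysis.py; the sample values are illustrative, not real model output:

analyser = SentimentAnalyser()
results = analyser.analyse_aspects_sentiment(
    rows=["The battery lasts all day."], aspects=["battery"])
# results might look like:
# [{"text": "The battery lasts all day.",
#   "aspect": ["battery"],
#   "sentiment": ["Positive"],
#   "confidence": [0.97],
#   "probs": [[0.01, 0.02, 0.97]]}]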

def generate_scattertext_visualization(self, dfanalysis, language):
# Get the DataFrame with sentiment analysis results
df = dfanalysis
165 changes: 136 additions & 29 deletions website/File_analysis.py
@@ -27,10 +27,10 @@
import threading
import json
import humanize

import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

en_stopwords = list(stopwords.words('english'))
cy_stopwords = open('website/data/welsh_stopwords.txt', 'r',
# replaced 'utf8' with 'iso-8859-1'
@@ -168,13 +168,6 @@ def fileanalysis():
elif input_method == 'example':
example_file = request.form.get('example-data')

print()
print()
print("example file")
print(example_file)
print()
print()

if not example_file:
# Handle the error - maybe raise an exception or return an error response.
current_app.logger.error(
@@ -186,12 +179,6 @@
file_extension = os.path.splitext(
file_path)[1].lower() # Extract the file extension

print()
print()
print("file path and ext")
print(file_path)
print(file_extension)

# Differentiate the behavior based on the file extension
if file_extension == '.txt':
with open(file_path, 'r', encoding='utf-8') as f:
@@ -441,6 +428,8 @@ def sentiment_analysis(sentences, language, sentiment_classes=3):
return data, sentiment_counts, plot_html_pie, plot_html_bar
return None, None, None

#! Update for ABSA


@FileAnalysis.route('/update_sentiment', methods=['POST'])
def handle_sentiment_update():
@@ -546,25 +535,25 @@ def handle_selected_rows():
random_word = random.choice(list(sorted_word_frequencies.keys()))
search_word = random_word
# Initialize the KWICAnalyser with the merged rows

# ucrel api used
analyser = KWICAnalyser(' '.join(merged_rows), language='en')
# Adding to session for word cloud to use
session["tokens_with_semantic_tags"] = analyser.tokens_with_semantic_tags

# Get the sorted unique list of semantic tags
# ucrel api used
sorted_unique_tags = analyser.get_sorted_unique_tags()
word_frequencies = analyser.get_word_frequencies()

unfiltered_word_frequencies = analyser.get_word_frequencies(
isUnfiltered=True)
session['unfiltered_word_frequencies'] = unfiltered_word_frequencies
session['word_frequencies'] = word_frequencies

session['mergedData'] = merged_rows
session['sentiment_data'] = sentiment_data

summary = summarize_text(merged_rows)

current_app.logger.info("/process_rows completed")

return jsonify({
"status": "success",
"wordFrequencies": word_frequencies,
@@ -1139,17 +1128,6 @@ def regenerate_wordcloud():
tag_words_associations = {tag: list({word for (word, pos, tag_entry) in words_tags if tag == tag_entry}) for (
word, pos, tag) in words_tags if tag in word_list}

print()
print()
print("tag and word associations")
print(tag_words_associations)

print("word lists")
print(word_list)
print(sec_word_list)
print()
print()

session['sec_word_cloud_src'] = sec_wc_path

json_data = {
@@ -1207,3 +1185,132 @@ def get_collos_data():
except Exception as e:

return "Server encountered an error", 500


@FileAnalysis.route('/aspect-based-analysis', methods=['POST'])
def aspect_based_sentiment_analysis():
data = request.get_json()
rows_data = data.get("rows", [])
aspects_data = data.get("aspects", [])
global_sentiments_data = data.get("includeGlobalSentiments", False)
language = data.get("language", "en")

analyser = SentimentAnalyser()

# Analyses aspect sentiment for each row in rows_data
try:
if not rows_data:
return jsonify({"status": "error", "message": "No rows data provided. Please provide text data to analyze."}), 400

if not aspects_data:
return jsonify({"status": "error", "message": "No aspects provided. Please provide aspects to analyse in the text."}), 400

if len(aspects_data) > 10:
return jsonify({"status": "error", "message": f"Too many aspects provided. You can include a maximum of 10 aspects. Current count: {len(aspects_data)}."}), 400

char_count = 0
for aspect in aspects_data:
char_count += len(aspect)

if char_count > 500:
return jsonify({"status": "error", "message": f"Too many characters provided. Your total aspect input must be less than 500 characters. Current count: {char_count}."}), 400

results = analyser.analyse_aspects_sentiment(
rows=rows_data, aspects=aspects_data, includeGlobalSentiments=global_sentiments_data)

if isinstance(results, Exception):
return jsonify({"status": "error", "message": f"No data to analyse for entered aspect(s).\n\nAspects: {', '.join(aspects_data)}"}), 400

except Exception as e:
current_app.logger.exception(f"Error: {e}")
return jsonify({"status": "error", "message": f"Error: {e}. Please try again."}), 500

sentiment_data = [{"Review": result["text"], "Aspect": aspect, "Sentiment Label": result["sentiment"][idx],
"Confidence Score": round(result["confidence"][idx], 2)} for result in results for idx, aspect in enumerate(result["aspect"])]

# Count sentiment results for pie chart; keys are lower-cased to match the
# case-insensitive aspect matching in find_aspects
aspect_sentiment_counter = {aspect.lower().strip(): {
"Positive": 0, "Neutral": 0, "Negative": 0} for aspect in aspects_data}

for entry in results:
for idx, asp in enumerate(entry["aspect"]):
# Ignores case
asp = asp.lower()

if asp in aspect_sentiment_counter:
aspect_sentiment_counter[asp][entry["sentiment"][idx]] += 1

# Remove previous absa plots
remove_previous_plots(
"website/static/Sentiment_plots", "sentiment_pie_absa_")

color_map = {
"Very negative": "#ff3333",
"Negative": "#ff8a3d",
"Neutral": "#b0b0b0",
"Positive": "#c5e17a",
"Very positive": "#6ebd45",
"Negyddol Iawn": "#ff3333",
"Negyddol": "#ff8a3d",
"Niwtral": "#b0b0b0",
"Cadarnhaol": "#c5e17a",
"Cadarnhaol Iawn": "#6ebd45"
}

# Sort aspects by their total number of sentiment occurrences, highest first
aspect_sentiment_counter = dict(sorted(aspect_sentiment_counter.items(
), key=lambda item: item[1]["Positive"] + item[1]["Neutral"] + item[1]["Negative"], reverse=True))

html_plots = []
plot_title = "Sentiment Distribution for: " if language == "en" else "Dosbarthiad Sentiment ar gyfer: "

for aspect, dict_val in aspect_sentiment_counter.items():
# If aspect is not in dataset, do not generate pie chart
if all(val == 0 for val in dict_val.values()):
continue

fig = px.pie(values=dict_val.values(), names=dict_val.keys(),
title=f"{plot_title}{aspect}", color=dict_val.keys(),
color_discrete_map=color_map)

plot_html_pie = fig.to_html(full_html=False)

fig.write_image(
f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.png")
fig.write_html(
f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.html")
with open(f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.html", "r", encoding="utf-8") as f:
content = f.read()

# Add the "Visualisation by" text and logo image at the bottom
addition = """
<div style="text-align:center; margin-top:30px;">
Visualisation by <img src="https://ucrel-freetxt-2.lancs.ac.uk/static/images/logo.png" alt="Logo" style="height:40px;">
</div>
"""

# Append the new content just before the closing body tag
content = content.replace("</body>", addition + "\n</body>")

with open(f"website/static/Sentiment_plots/sentiment_pie_absa_{aspect}.html", "w", encoding="utf-8") as f:
f.write(content)

# Tuple containing the plot and the total number of occurrences
html_plots.append((plot_html_pie, sum(dict_val.values())))

return jsonify({
"status": "success",
"plots": html_plots,
"sentimentData": sentiment_data
})
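
A sketch of how a client could call the new route. The JSON fields match what the handler reads above; the host, port, and any blueprint URL prefix are assumptions about the deployment:

import requests

resp = requests.post(
    "http://localhost:5000/aspect-based-analysis",  # prefix depends on how the FileAnalysis blueprint is registered
    json={
        "rows": ["The battery lasts all day.", "The screen is too dim."],
        "aspects": ["battery", "screen"],  # at most 10 aspects, 500 characters in total
        "includeGlobalSentiments": False,
        "language": "en",
    },
)
data = resp.json()
print(data["sentimentData"])  # per-row aspect sentiments
# data["plots"] holds (pie-chart HTML, total occurrence count) tuples per aspect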


def remove_previous_plots(directory, substring):
for filename in os.listdir(directory):
if substring in filename:
file_path = os.path.join(directory, filename)
try:
os.remove(file_path)
# print("Removed file:", file_path)
except Exception as e:
current_app.logger.exception("Error removing old plots: %s", e)
Binary file modified website/__pycache__/File_analysis.cpython-312.pyc
Binary file not shown.
Binary file modified website/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion website/static/Sentiment_plots/sentiment_bar.html

Large diffs are not rendered by default.

Binary file added website/static/Sentiment_plots/sentiment_bar.png
2 changes: 1 addition & 1 deletion website/static/Sentiment_plots/sentiment_pie.html

Large diffs are not rendered by default.

Binary file modified website/static/Sentiment_plots/sentiment_pie.png
9 changes: 9 additions & 0 deletions website/static/css/style-file-analysis.css
@@ -46,6 +46,15 @@
resize: vertical;
}

#absa-aspects-to-analyze {
height: 40px;
padding: 10px;
font-size: 16px;
border: 1px solid #ccc;
border-radius: 5px;
resize: vertical;
}

.analyze-btn {
display: flex;
align-items: center;
2 changes: 2 additions & 0 deletions website/static/css/style.css
@@ -583,6 +583,8 @@ samp {
}

pre {
font-family: -apple-system, BlinkMacSystemFont, "Helvetica Neue", Helvetica,
sans-serif !important;
border: 0.1rem solid #dcd7ca;
line-height: 1.5;
margin: 4rem 0;