Merge pull request #128 from SynBioDex/change_dependencies

Change dependencies and optimize each python module

cl117 authored Sep 15, 2024
2 parents d989c97 + 3370a59 commit 2306015

Showing 19 changed files with 921 additions and 1,200 deletions.
156 changes: 57 additions & 99 deletions flask/cluster.py
@@ -1,32 +1,32 @@
 from xml.etree import ElementTree
 import subprocess
-import utils
+from configManager import ConfigManager
+from logger import Logger
 import query
 from sys import platform
 
-
-uclust_identity = utils.get_config()['uclust_identity'] # how similar sequences in the same cluster must be
+config_manager = ConfigManager()
+config = config_manager.load_config()  # Load config once
+uclust_identity = config['uclust_identity']  # Get the uclust identity value
+logger_ = Logger()
 sequences_filename = 'dumps/sequences.fsa'
 
-if 'which_search' not in utils.get_config():
-    explorerConfig = utils.get_config()
-    explorerConfig['which_search'] = 'vsearch'
-    utils.set_config(explorerConfig)
+# Ensure 'which_search' is set in config
+if 'which_search' not in config:
+    config['which_search'] = 'vsearch'
+    config_manager.save_config(config)
 
-whichSearch = utils.get_config()['which_search']
+whichSearch = config['which_search']
 
-if platform == "linux" or platform == "linux2":
-    if whichSearch == 'usearch':
-        usearch_binary_filename = 'usearch/usearch10.0.240_i86linux32'
-    elif whichSearch == 'vsearch':
-        usearch_binary_filename = 'usearch/vsearch_linux'
+# Determine the correct binary filename based on OS and search tool
+usearch_binary_filename = None
+if platform.startswith("linux"):
+    usearch_binary_filename = 'usearch/vsearch_linux' if whichSearch == 'vsearch' else 'usearch/usearch10.0.240_i86linux32'
 elif platform == "darwin":
-    if whichSearch == 'usearch':
-        usearch_binary_filename = 'usearch/usearch11.0.667_i86osx32'
-    elif whichSearch == 'vsearch':
-        usearch_binary_filename = 'usearch/vsearch_macos'
+    usearch_binary_filename = 'usearch/vsearch_macos' if whichSearch == 'vsearch' else 'usearch/usearch11.0.667_i86osx32'
 else:
-    utils.log("Sorry, your OS is not supported for sequence based-search.")
+    logger_.log("Sorry, your OS is not supported for sequence-based search.")
     raise SystemExit
 
 uclust_results_filename = 'usearch/uclust_results.uc'
@@ -40,115 +40,73 @@
 }
 '''
 
-
 def write_fasta(sequences):
-    f = open(sequences_filename, 'w')
-
-    for sequence in sequences:
-        f.write('>%s\n' % sequence['subject'])
-        f.write('%s\n' % sequence['sequence'])
-
-    f.close()
-
+    with open(sequences_filename, 'w') as f:
+        for sequence in sequences:
+            f.write(f">{sequence['subject']}\n{sequence['sequence']}\n")
 
 def run_uclust():
     args = [usearch_binary_filename, '-cluster_fast', sequences_filename, '-id', uclust_identity, '-sort', 'length', '-uc', uclust_results_filename]
-    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
-    popen.wait()
-    output = popen.stdout.read()
-    utils.log_indexing(str(output))
-
+    # result = subprocess.run(args, capture_output=True, text=True) # Python3.7
+    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+    logger_.log(result.stdout, True)
 
 def analyze_uclust():
-    f = open(uclust_results_filename, 'r')
-    results = f.read()
-
     total_parts = 0
     total_identity = 0.0
     hits = 0
 
-    lines = results.splitlines()
-    for line in lines:
-        line = line.split()
-        record_type = line[0]
-
-        if record_type in ('H', 'S'):
-            total_parts += 1
-
-        if line[0] is 'H':
-            total_identity += float(line[3])
-            hits += 1
-
-    f.close()
-    utils.log_indexing('parts: ' + str(total_parts))
-    utils.log_indexing('hits: ' + str(hits))
-
+    with open(uclust_results_filename, 'r') as f:
+        for line in f:
+            parts = line.split()
+            record_type = parts[0]
+            if record_type in ('H', 'S'):
+                total_parts += 1
+            if record_type == 'H':
+                total_identity += float(parts[3])
+                hits += 1
+
+    logger_.log(f'parts: {total_parts}', True)
+    logger_.log(f'hits: {hits}', True)
     if hits > 0:
-        utils.log_indexing('average hit identity: ' + str(total_identity / hits))
-
+        logger_.log(f'average hit identity: {total_identity / hits}', True)
 
 def uclust2uris(fileName):
     uris = set()
 
-    f = open(fileName, 'r')
-    results = f.read()
-    lines = results.splitlines()
-
-    for line in lines:
-        line = line.split()
-
-        if line[0] is 'H':
-            partURI = line[9]
-
-            uris.add(partURI)
-
-    f.close()
-
+    with open(fileName, 'r') as f:
+        for line in f:
+            parts = line.split()
+            if parts[0] == 'H':
+                uris.add(parts[9])
     return uris
 
 def uclust2clusters():
     # populate cluster2parts
     cluster2parts = {}
 
-    f = open(uclust_results_filename, 'r')
-    results = f.read()
-    lines = results.splitlines()
-
-    for line in lines:
-        line = line.split()
-
-        if line[0] is 'H' or line[0] is 'S':
-            part, cluster = line[8], line[1]
-
-            if cluster not in cluster2parts:
-                cluster2parts[cluster] = set()
-            cluster2parts[cluster].add(part)
-
-    f.close()
-
+    with open(uclust_results_filename, 'r') as f:
+        for line in f:
+            parts = line.split()
+            if parts[0] in ('H', 'S'):
+                part, cluster = parts[8], parts[1]
+                if cluster not in cluster2parts:
+                    cluster2parts[cluster] = set()
+                cluster2parts[cluster].add(part)
 
     # transform cluster2parts to clusters
-    clusters = {}
-
-    for cluster in cluster2parts:
-        parts = cluster2parts[cluster]
-        for part in parts:
-            clusters[part] = parts.difference({part})
+    clusters = {part: parts.difference({part}) for cluster, parts in cluster2parts.items() for part in parts}
 
     return clusters
 
-
 def update_clusters():
-    utils.log_indexing('------------ Updating clusters ------------')
-    utils.log_indexing('******** Query for sequences ********')
+    logger_.log('------------ Updating clusters ------------', True)
+    logger_.log('******** Query for sequences ********', True)
     sequences_response = query.query_sparql(sequence_query)
-    utils.log_indexing('******** Query for sequences complete ********')
+    logger_.log('******** Query for sequences complete ********', True)
     write_fasta(sequences_response)
 
-    utils.log_indexing('******** Running uclust ********')
+    logger_.log('******** Running uclust ********', True)
     run_uclust()
-    utils.log_indexing('******** Running uclust complete ********')
+    logger_.log('******** Running uclust complete ********', True)
 
     analyze_uclust()
-    utils.log_indexing('------------ Successsfully updated clusters ------------\n')
+    logger_.log('------------ Successfully updated clusters ------------\n', True)
     return uclust2clusters()
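Note: all of the rewritten parsers above key off the tab-separated UCLUST/VSEARCH .uc output. A minimal sketch of the record layout the new code relies on; the sample line and part URIs below are illustrative, not taken from this commit:

# Hypothetical .uc 'H' (hit) record; fields are tab-separated.
sample = "H\t3\t120\t98.3\t+\t0\t0\t120M\thttp://example.org/partA\thttp://example.org/partB"
fields = sample.split()
record_type = fields[0]      # 'H' = hit, 'S' = cluster seed; 'C' summary rows are skipped
cluster_id = fields[1]       # cluster number shared by a seed and its hits
identity = float(fields[3])  # percent identity, meaningful only for 'H' records
query_label = fields[8]      # the part URI written by write_fasta
target_label = fields[9]     # the seed the part clustered with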

2 changes: 1 addition & 1 deletion flask/config.json
@@ -2,7 +2,7 @@
   "uclust_identity": "0.8",
   "elasticsearch_index_name": "part",
   "pagerank_tolerance": "0.0001",
-  "elasticsearch_endpoint": "http://localhost:9200/",
+  "elasticsearch_endpoint": "http://elasticsearch:9200/",
   "sparql_endpoint": "http://localhost:8890/sparql?",
   "last_update_start": "none",
   "last_update_end": "none",
63 changes: 63 additions & 0 deletions flask/configManager.py
@@ -0,0 +1,63 @@
+import json
+import datetime
+
+class ConfigManager:
+    def __init__(self, config_file='config.json'):
+        self.config_file = config_file
+        self._config = None
+
+    def load_config(self):
+        """
+        Gets a copy of the config file
+        Returns: Config file in JSON
+        """
+        if self._config is None:
+            with open(self.config_file) as f:
+                self._config = json.load(f)
+        return self._config
+
+    def save_config(self, new_config):
+        """
+        Overwrites the existing config with a new config file
+        Args:
+            new_config: New config file with the updated information
+        Returns:
+        """
+        config = self.load_config()
+        config.update(new_config)
+        with open(self.config_file, 'w') as f:
+            json.dump(config, f)
+
+    def save_time(self, attribute):
+        """
+        Saves the current time to an attribute in the config
+        Args:
+            attribute: Config attribute to save current time to
+        Returns:
+        """
+        config = self.load_config()
+        config[attribute] = datetime.datetime.now().isoformat()
+        self.save_config(config)
+
+    def get_es_endpoint(self):
+        return self.load_config().get('elasticsearch_endpoint')
+
+    def save_update_end_time(self):
+        """
+        Save end time of indexing
+        Returns:
+        """
+        return self.save_time("last_update_end")
+
+    def save_update_start_time(self):
+        """
+        Save start time of indexing
+        Returns:
+        """
+        return self.save_time("last_update_start")
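A short usage sketch of the new ConfigManager, mirroring how cluster.py uses it; the keys come from flask/config.json:

from configManager import ConfigManager

cm = ConfigManager()                         # defaults to 'config.json'
config = cm.load_config()                    # read once, then cached on the instance
print(config['uclust_identity'])             # "0.8"
cm.save_config({'which_search': 'vsearch'})  # merged into the loaded config and written back
cm.save_update_start_time()                  # stamps 'last_update_start' with the current time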
76 changes: 76 additions & 0 deletions flask/dataManager.py
@@ -0,0 +1,76 @@
+import pickle
+import os
+class DataManager:
+    def __init__(self, clusters_filename='dumps/clusters_dump', uri2rank_filename='dumps/uri2rank_dump'):
+        self.clusters_filename = clusters_filename
+        self.uri2rank_filename = uri2rank_filename
+        self._clusters = None
+        self._uri2rank = None
+
+    def save_clusters(self, clusters):
+        """
+        Save clusters of parts
+        Args:
+            new_clusters: Clusters to be saved
+        Returns:
+        """
+        self._clusters = clusters
+        self._serialize(self._clusters, self.clusters_filename)
+
+    def get_clusters(self):
+        if self._clusters is None:
+            self._clusters = self._deserialize(self.clusters_filename)
+        return self._clusters
+
+    def save_uri2rank(self, uri2rank):
+        """
+        Saves the pagerank of all URI's
+        Args:
+            new_uri2rank:
+        Returns:
+        """
+        self._uri2rank = uri2rank
+        self._serialize(self._uri2rank, self.uri2rank_filename)
+
+    def get_uri2rank(self):
+        """
+        Gets all pageranks of URI's
+        Returns:
+        """
+        if self._uri2rank is None:
+            self._uri2rank = self._deserialize(self.uri2rank_filename)
+        return self._uri2rank
+
+    @staticmethod
+    def _serialize(data, filename):
+        """
+        Serializes some data to a file
+        Args:
+            data: Data to be written
+            filename: File to be written to
+        Returns:
+        """
+        with open(filename, 'wb') as f:
+            pickle.dump(data, f)
+
+    @staticmethod
+    def _deserialize(filename):
+        """
+        Deserializes data from a serialized file
+        Args:
+            filename: Serialized file
+        Returns: Deserialized data from file
+        """
+        if os.path.exists(filename):
+            with open(filename, 'rb') as f:
+                return pickle.load(f)
+        return {}
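A short usage sketch of the new DataManager; the cluster value here is a made-up example, and the dump files are created on first save:

from dataManager import DataManager

dm = DataManager()                        # default dump paths under dumps/
dm.save_clusters({'partA': {'partB'}})    # pickled to dumps/clusters_dump
clusters = dm.get_clusters()              # cached in memory after the first load
uri2rank = dm.get_uri2rank()              # returns {} when no dump exists yet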
27 changes: 19 additions & 8 deletions flask/docker/Dockerfile
@@ -1,13 +1,25 @@
-FROM ubuntu:16.04
-MAINTAINER Michael Zhang <[email protected]>
+FROM ubuntu:22.04
+
+# Set the timezone environment variables to avoid interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=America/New_York
+
+# Install tzdata without interaction
+RUN apt-get update && apt-get install -y tzdata
+
+# Set timezone
+RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
+    dpkg-reconfigure -f noninteractive tzdata
 
 RUN apt-get update && \
-    apt-get install -y software-properties-common && \
+    apt-get install -y software-properties-common coreutils && \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get update && \
-    apt-get install -y git python3.6 python3.6-pip && \
-    python3.6 -m pip install pip --upgrade && \
-    git clone https://github.com/michael13162/SBOLExplorer.git && \
+    apt-get install -y git cron python3.11 python3-pip python3.11-venv && \
+    python3.11 -m pip install pip --upgrade && \
+    python3.11 -m venv jammy && \
+    . jammy/bin/activate && \
+    git clone https://github.com/SynBioDex/SBOLExplorer.git && \
     cd SBOLExplorer/flask && \
     pip install -r requirements.txt && \
     crontab update.cron
@@ -26,5 +38,4 @@ RUN mkdir /mnt/config && \
     rm -rf dumps && \
     ln -s /mnt/data dumps
 
-CMD "./start.sh"
-
+CMD sh -c ". ../../jammy/bin/activate && ./start.sh"