Merge pull request #128 from SynBioDex/change_dependencies

Change dependencies and optimize each python module

cl117 authored Sep 15, 2024
2 parents d989c97 + 3370a59 commit 2306015

Showing 19 changed files with 921 additions and 1,200 deletions.
156 changes: 57 additions & 99 deletions flask/cluster.py
@@ -1,32 +1,32 @@
 from xml.etree import ElementTree
 import subprocess
-import utils
+from configManager import ConfigManager
+from logger import Logger
 import query
 from sys import platform
 
-
-uclust_identity = utils.get_config()['uclust_identity'] # how similar sequences in the same cluster must be
+config_manager = ConfigManager()
+config = config_manager.load_config()  # Load config once
+uclust_identity = config['uclust_identity']  # Get the uclust identity value
+logger_ = Logger()
 sequences_filename = 'dumps/sequences.fsa'
 
-if 'which_search' not in utils.get_config():
-    explorerConfig = utils.get_config()
-    explorerConfig['which_search'] = 'vsearch'
-    utils.set_config(explorerConfig)
+# Ensure 'which_search' is set in config
+if 'which_search' not in config:
+    config['which_search'] = 'vsearch'
+    config_manager.save_config(config)
 
-whichSearch = utils.get_config()['which_search']
+whichSearch = config['which_search']
 
-if platform == "linux" or platform == "linux2":
-    if whichSearch == 'usearch':
-        usearch_binary_filename = 'usearch/usearch10.0.240_i86linux32'
-    elif whichSearch == 'vsearch':
-        usearch_binary_filename = 'usearch/vsearch_linux'
+# Determine the correct binary filename based on OS and search tool
+usearch_binary_filename = None
+if platform.startswith("linux"):
+    usearch_binary_filename = 'usearch/vsearch_linux' if whichSearch == 'vsearch' else 'usearch/usearch10.0.240_i86linux32'
 elif platform == "darwin":
-    if whichSearch == 'usearch':
-        usearch_binary_filename = 'usearch/usearch11.0.667_i86osx32'
-    elif whichSearch == 'vsearch':
-        usearch_binary_filename = 'usearch/vsearch_macos'
+    usearch_binary_filename = 'usearch/vsearch_macos' if whichSearch == 'vsearch' else 'usearch/usearch11.0.667_i86osx32'
 else:
-    utils.log("Sorry, your OS is not supported for sequence based-search.")
+    logger_.log("Sorry, your OS is not supported for sequence-based search.")
     raise SystemExit
 
 uclust_results_filename = 'usearch/uclust_results.uc'
@@ -40,115 +40,73 @@
 }
 '''
 
-
 def write_fasta(sequences):
-    f = open(sequences_filename, 'w')
-
-    for sequence in sequences:
-        f.write('>%s\n' % sequence['subject'])
-        f.write('%s\n' % sequence['sequence'])
-
-    f.close()
-
+    with open(sequences_filename, 'w') as f:
+        for sequence in sequences:
+            f.write(f">{sequence['subject']}\n{sequence['sequence']}\n")
 
 def run_uclust():
     args = [usearch_binary_filename, '-cluster_fast', sequences_filename, '-id', uclust_identity, '-sort', 'length', '-uc', uclust_results_filename]
-    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
-    popen.wait()
-    output = popen.stdout.read()
-    utils.log_indexing(str(output))
-
+    # result = subprocess.run(args, capture_output=True, text=True) # Python3.7
+    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+    logger_.log(result.stdout, True)
 
 def analyze_uclust():
-    f = open(uclust_results_filename, 'r')
-    results = f.read()
-
     total_parts = 0
     total_identity = 0.0
     hits = 0
 
-    lines = results.splitlines()
-    for line in lines:
-        line = line.split()
-        record_type = line[0]
-
-        if record_type in ('H', 'S'):
-            total_parts += 1
-
-        if line[0] is 'H':
-            total_identity += float(line[3])
-            hits += 1
-
-    f.close()
-    utils.log_indexing('parts: ' + str(total_parts))
-    utils.log_indexing('hits: ' + str(hits))
-
+    with open(uclust_results_filename, 'r') as f:
+        for line in f:
+            parts = line.split()
+            record_type = parts[0]
+            if record_type in ('H', 'S'):
+                total_parts += 1
+            if record_type == 'H':
+                total_identity += float(parts[3])
+                hits += 1
+
+    logger_.log(f'parts: {total_parts}', True)
+    logger_.log(f'hits: {hits}', True)
     if hits > 0:
-        utils.log_indexing('average hit identity: ' + str(total_identity / hits))
-
+        logger_.log(f'average hit identity: {total_identity / hits}', True)
 
 def uclust2uris(fileName):
     uris = set()
 
-    f = open(fileName, 'r')
-    results = f.read()
-    lines = results.splitlines()
-
-    for line in lines:
-        line = line.split()
-
-        if line[0] is 'H':
-            partURI = line[9]
-
-            uris.add(partURI)
-
-    f.close()
-
+    with open(fileName, 'r') as f:
+        for line in f:
+            parts = line.split()
+            if parts[0] == 'H':
+                uris.add(parts[9])
     return uris
 
 def uclust2clusters():
     # populate cluster2parts
     cluster2parts = {}
 
-    f = open(uclust_results_filename, 'r')
-    results = f.read()
-    lines = results.splitlines()
-
-    for line in lines:
-        line = line.split()
-
-        if line[0] is 'H' or line[0] is 'S':
-            part, cluster = line[8], line[1]
-
-            if cluster not in cluster2parts:
-                cluster2parts[cluster] = set()
-            cluster2parts[cluster].add(part)
-
-    f.close()
-
+    with open(uclust_results_filename, 'r') as f:
+        for line in f:
+            parts = line.split()
+            if parts[0] in ('H', 'S'):
+                part, cluster = parts[8], parts[1]
+                if cluster not in cluster2parts:
+                    cluster2parts[cluster] = set()
+                cluster2parts[cluster].add(part)
 
     # transform cluster2parts to clusters
-    clusters = {}
-
-    for cluster in cluster2parts:
-        parts = cluster2parts[cluster]
-        for part in parts:
-            clusters[part] = parts.difference({part})
+    clusters = {part: parts.difference({part}) for cluster, parts in cluster2parts.items() for part in parts}
 
     return clusters
 
-
 def update_clusters():
-    utils.log_indexing('------------ Updating clusters ------------')
-    utils.log_indexing('******** Query for sequences ********')
+    logger_.log('------------ Updating clusters ------------', True)
+    logger_.log('******** Query for sequences ********', True)
     sequences_response = query.query_sparql(sequence_query)
-    utils.log_indexing('******** Query for sequences complete ********')
+    logger_.log('******** Query for sequences complete ********', True)
     write_fasta(sequences_response)
 
-    utils.log_indexing('******** Running uclust ********')
+    logger_.log('******** Running uclust ********', True)
     run_uclust()
-    utils.log_indexing('******** Running uclust complete ********')
+    logger_.log('******** Running uclust complete ********', True)
 
     analyze_uclust()
-    utils.log_indexing('------------ Successsfully updated clusters ------------\n')
+    logger_.log('------------ Successfully updated clusters ------------\n', True)
     return uclust2clusters()
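Note: all of the rewritten parsers above key off the tab-separated UCLUST/VSEARCH .uc output. A minimal sketch of the record layout the new code relies on; the sample line and part URIs below are illustrative, not taken from this commit:

# Hypothetical .uc 'H' (hit) record; fields are tab-separated.
sample = "H\t3\t120\t98.3\t+\t0\t0\t120M\thttp://example.org/partA\thttp://example.org/partB"
fields = sample.split()
record_type = fields[0]      # 'H' = hit, 'S' = cluster seed; 'C' summary rows are skipped
cluster_id = fields[1]       # cluster number shared by a seed and its hits
identity = float(fields[3])  # percent identity, meaningful only for 'H' records
query_label = fields[8]      # the part URI written by write_fasta
target_label = fields[9]     # the seed the part clustered with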

2 changes: 1 addition & 1 deletion flask/config.json
@@ -2,7 +2,7 @@
   "uclust_identity": "0.8",
   "elasticsearch_index_name": "part",
   "pagerank_tolerance": "0.0001",
-  "elasticsearch_endpoint": "http://localhost:9200/",
+  "elasticsearch_endpoint": "http://elasticsearch:9200/",
   "sparql_endpoint": "http://localhost:8890/sparql?",
   "last_update_start": "none",
   "last_update_end": "none",
63 changes: 63 additions & 0 deletions flask/configManager.py
@@ -0,0 +1,63 @@
+import json
+import datetime
+
+class ConfigManager:
+    def __init__(self, config_file='config.json'):
+        self.config_file = config_file
+        self._config = None
+
+    def load_config(self):
+        """
+        Gets a copy of the config file
+        Returns: Config file in JSON
+        """
+        if self._config is None:
+            with open(self.config_file) as f:
+                self._config = json.load(f)
+        return self._config
+
+    def save_config(self, new_config):
+        """
+        Overwrites the existing config with a new config file
+        Args:
+            new_config: New config file with the updated information
+        Returns:
+        """
+        config = self.load_config()
+        config.update(new_config)
+        with open(self.config_file, 'w') as f:
+            json.dump(config, f)
+
+    def save_time(self, attribute):
+        """
+        Saves the current time to an attribute in the config
+        Args:
+            attribute: Config attribute to save current time to
+        Returns:
+        """
+        config = self.load_config()
+        config[attribute] = datetime.datetime.now().isoformat()
+        self.save_config(config)
+
+    def get_es_endpoint(self):
+        return self.load_config().get('elasticsearch_endpoint')
+
+    def save_update_end_time(self):
+        """
+        Save end time of indexing
+        Returns:
+        """
+        return self.save_time("last_update_end")
+
+    def save_update_start_time(self):
+        """
+        Save start time of indexing
+        Returns:
+        """
+        return self.save_time("last_update_start")
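A short usage sketch of the new ConfigManager, mirroring how cluster.py uses it; the keys come from flask/config.json:

from configManager import ConfigManager

cm = ConfigManager()                         # defaults to 'config.json'
config = cm.load_config()                    # read once, then cached on the instance
print(config['uclust_identity'])             # "0.8"
cm.save_config({'which_search': 'vsearch'})  # merged into the loaded config and written back
cm.save_update_start_time()                  # stamps 'last_update_start' with the current time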
76 changes: 76 additions & 0 deletions flask/dataManager.py
@@ -0,0 +1,76 @@
+import pickle
+import os
+class DataManager:
+    def __init__(self, clusters_filename='dumps/clusters_dump', uri2rank_filename='dumps/uri2rank_dump'):
+        self.clusters_filename = clusters_filename
+        self.uri2rank_filename = uri2rank_filename
+        self._clusters = None
+        self._uri2rank = None
+
+    def save_clusters(self, clusters):
+        """
+        Save clusters of parts
+        Args:
+            new_clusters: Clusters to be saved
+        Returns:
+        """
+        self._clusters = clusters
+        self._serialize(self._clusters, self.clusters_filename)
+
+    def get_clusters(self):
+        if self._clusters is None:
+            self._clusters = self._deserialize(self.clusters_filename)
+        return self._clusters
+
+    def save_uri2rank(self, uri2rank):
+        """
+        Saves the pagerank of all URI's
+        Args:
+            new_uri2rank:
+        Returns:
+        """
+        self._uri2rank = uri2rank
+        self._serialize(self._uri2rank, self.uri2rank_filename)
+
+    def get_uri2rank(self):
+        """
+        Gets all pageranks of URI's
+        Returns:
+        """
+        if self._uri2rank is None:
+            self._uri2rank = self._deserialize(self.uri2rank_filename)
+        return self._uri2rank
+
+    @staticmethod
+    def _serialize(data, filename):
+        """
+        Serializes some data to a file
+        Args:
+            data: Data to be written
+            filename: File to be written to
+        Returns:
+        """
+        with open(filename, 'wb') as f:
+            pickle.dump(data, f)
+
+    @staticmethod
+    def _deserialize(filename):
+        """
+        Deserializes data from a serialized file
+        Args:
+            filename: Serialized file
+        Returns: Deserialized data from file
+        """
+        if os.path.exists(filename):
+            with open(filename, 'rb') as f:
+                return pickle.load(f)
+        return {}
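A short usage sketch of the new DataManager; the cluster value here is a made-up example, and the dump files are created on first save:

from dataManager import DataManager

dm = DataManager()                        # default dump paths under dumps/
dm.save_clusters({'partA': {'partB'}})    # pickled to dumps/clusters_dump
clusters = dm.get_clusters()              # cached in memory after the first load
uri2rank = dm.get_uri2rank()              # returns {} when no dump exists yet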
27 changes: 19 additions & 8 deletions flask/docker/Dockerfile
@@ -1,13 +1,25 @@
-FROM ubuntu:16.04
-MAINTAINER Michael Zhang <[email protected]>
+FROM ubuntu:22.04
+
+# Set the timezone environment variables to avoid interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=America/New_York
+
+# Install tzdata without interaction
+RUN apt-get update && apt-get install -y tzdata
+
+# Set timezone
+RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
+    dpkg-reconfigure -f noninteractive tzdata
 
 RUN apt-get update && \
-    apt-get install -y software-properties-common && \
+    apt-get install -y software-properties-common coreutils && \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get update && \
-    apt-get install -y git python3.6 python3.6-pip && \
-    python3.6 -m pip install pip --upgrade && \
-    git clone https://github.com/michael13162/SBOLExplorer.git && \
+    apt-get install -y git cron python3.11 python3-pip python3.11-venv && \
+    python3.11 -m pip install pip --upgrade && \
+    python3.11 -m venv jammy && \
+    . jammy/bin/activate && \
+    git clone https://github.com/SynBioDex/SBOLExplorer.git && \
     cd SBOLExplorer/flask && \
     pip install -r requirements.txt && \
     crontab update.cron
@@ -26,5 +38,4 @@ RUN mkdir /mnt/config && \
     rm -rf dumps && \
     ln -s /mnt/data dumps
 
-CMD "./start.sh"
-
+CMD sh -c ". ../../jammy/bin/activate && ./start.sh"