Skip to content

Commit

Permalink
fix create_db_schema; fix tests; reduce memory use
Browse files Browse the repository at this point in the history
  • Loading branch information
colin-combe committed Apr 19, 2024
1 parent a418a9b commit 3a67c4b
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ credentials.py

test.db
database.ini
config/database.ini
kubernetes.yml
Dockerfile
xi_mzidentml_converter.egg-info
Expand Down
71 changes: 61 additions & 10 deletions parser/MzIdParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
import obonet
from pyteomics import mzid # https://pyteomics.readthedocs.io/en/latest/data.html#controlled-vocabularies
from pyteomics.auxiliary import cvquery
from sqlalchemy import Table
from pyteomics.xml import _local_name
from lxml import etree
from sqlalchemy.exc import SQLAlchemyError

from parser import writer
from parser.api_writer import APIWriter
from parser.peaklistReader.PeakListWrapper import PeakListWrapper

Expand Down Expand Up @@ -74,7 +74,6 @@ def __init__(self, mzid_path, temp_dir, peak_list_dir, writer, logger):

self.logger.info('reading mzid - done. Time: {} sec'.format(round(time() - start_time, 2)))


def parse(self):
"""Parse the file."""
start_time = time()
Expand Down Expand Up @@ -585,8 +584,15 @@ def main_loop(self):

# iterate over all the spectrum identification lists
for sil_id in self.mzid_reader._offset_index["SpectrumIdentificationList"].keys():
sil = self.mzid_reader.get_by_id(sil_id, tag_id='SpectrumIdentificationList')
for sid_result in sil['SpectrumIdentificationResult']:
# sil = self.mzid_reader.get_by_id(sil_id, tag_id='SpectrumIdentificationList')
self.mzid_reader.reset()
for sid_result in iterfind_when(
self.mzid_reader,
"SpectrumIdentificationResult",
"SpectrumIdentificationList",
lambda x: x.attrib["id"] == sil_id,
retrieve_refs=False
):
if self.peak_list_dir:
peak_list_reader = self.peak_list_readers[sid_result['spectraData_ref']]

Expand Down Expand Up @@ -667,7 +673,7 @@ def main_loop(self):
'scores': scores,
'exp_mz': spec_id_item['experimentalMassToCharge'],
'calc_mz': calculated_mass_to_charge,
'sil_id': sil['id'],
'sil_id': sil_id,
}

spectrum_ident_dict[crosslink_id] = ident_data
Expand Down Expand Up @@ -695,9 +701,10 @@ def main_loop(self):
db_wrap_up_start_time = time()
self.logger.info('write remaining entries to DB - start')

if self.peak_list_dir:
if self.peak_list_dir and spectra: # spectra is not empty
self.writer.write_data('spectrum', spectra)
self.writer.write_data('spectrumidentification', spectrum_identifications)
if spectrum_identifications: # spectrum_identifications is not empty
self.writer.write_data('spectrumidentification', spectrum_identifications)

self.logger.info('write remaining entries to DB - done. Time: {} sec'.format(
round(time() - db_wrap_up_start_time, 2)))
Expand Down Expand Up @@ -745,7 +752,8 @@ def upload_info(self):
bib_refs.append(bib)
self.mzid_reader.reset()

self.writer.write_mzid_info(analysis_software_list, spectra_formats, provider, audits, samples, bib_refs, self.writer.upload_id)
self.writer.write_mzid_info(analysis_software_list, spectra_formats, provider, audits, samples, bib_refs,
self.writer.upload_id)

self.logger.info('getting upload info - done Time: {} sec'.format(
round(time() - upload_info_start_time, 2)))
Expand All @@ -766,7 +774,7 @@ def write_new_upload(self):

response = self.writer.write_new_upload(table, upload_data)
if response:
self.writer.upload_id =int(response)
self.writer.upload_id = int(response)
else:
raise Exception("Response is not available to create a upload ID")
except SQLAlchemyError as e:
Expand Down Expand Up @@ -847,6 +855,49 @@ def extract_mzid(archive):
else:
raise BaseException('unsupported file type: %s' % archive)

def iterfind_when(source, target_name, condition_name, stack_predicate, **kwargs):
    """
    Iteratively parse the XML stream in ``source``, yielding parsed values for elements
    matching ``target_name``, but only while the most recently opened ``condition_name``
    element satisfied ``stack_predicate``.

    This streams the document instead of materialising a whole ``condition_name``
    sub-tree (e.g. a full SpectrumIdentificationList) in memory; elements are
    ``clear()``-ed as soon as they can no longer be needed, which keeps peak
    memory use low.

    Parameters
    ----------
    source: file-like / pyteomics reader
        An object over an XML document that also exposes ``_get_info_smart``
        (e.g. a pyteomics ``MzIdentML`` reader).
        NOTE(review): ``_get_info_smart`` is a private pyteomics API -- confirm
        it is stable across the pinned pyteomics version.
    target_name: str
        Local name of the XML elements to parse and yield.
    condition_name: str
        Local name of the gating element; when it opens, ``stack_predicate``
        decides whether the ``target_name`` elements inside it are yielded.
    stack_predicate: callable
        Called with the ``condition_name`` :class:`etree.Element` at its "start"
        event and returns a :class:`bool`.  At the "start" event only the
        element's attributes are populated, not its children.
    **kwargs:
        Additional arguments passed to :meth:`source._get_info_smart`
        (e.g. ``retrieve_refs=False``).

    Yields
    ------
    The value returned by ``source._get_info_smart`` for each matching element
    (typically a dict-like structure, not a raw element).
    """
    # Listen for both "start" and "end" events: "start" toggles the gate,
    # "end" means an element (and its children) is fully parsed.
    g = etree.iterparse(source, ("start", "end"))
    state = False  # True while inside a condition_name sub-tree that passed the predicate
    history = []   # sub-elements kept alive because they may belong to a pending target
    for event, tag in g:
        lc_name = _local_name(tag)
        if event == "start":
            if lc_name == condition_name:
                # Decide, from the opening tag's attributes alone, whether the
                # targets inside this sub-tree should be parsed.
                state = stack_predicate(tag)
        else:
            if lc_name == target_name and state:
                value = source._get_info_smart(tag, **kwargs)
                # Target fully consumed: free everything buffered for it so
                # lxml does not keep the sub-tree in memory.
                for t in history:
                    t.clear()
                history.clear()
                yield value
            elif state:
                # Descendant of an active sub-tree; must stay alive until its
                # enclosing target has been parsed by _get_info_smart.
                history.append(tag)
            elif not state:
                # Outside any matching sub-tree: discard immediately.
                tag.clear()

class xiSPEC_MzIdParser(MzIdParser):

Expand Down
1 change: 1 addition & 0 deletions parser/database/create_db_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from sqlalchemy_utils import database_exists, drop_database, create_database

from models.base import Base
from models import *


def create_db(connection_str):
Expand Down
5 changes: 2 additions & 3 deletions parser/peaklistReader/PeakListWrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,8 @@ def load(self, source, file_name=None, source_path=None):
:param file_name: (str) mzML filename
:param source_path: (str) path to the source file (mzML or archive)
"""
self._reader = mzml.read(source)
if self._reader.index is None:
self._reader = mzml.read(source, use_index=True)

self._reader = mzml.read(source, use_index=True, huge_tree=True)
super().load(source, file_name, source_path)

def reset(self):
Expand Down
14 changes: 7 additions & 7 deletions tests/parse_csv.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from parser import FullCsvParser, NoPeakListsCsvParser, LinksOnlyCsvParser
from parser.api_writer import Writer
from parser.database_writer import DatabaseWriter
from sqlalchemy import text
from uuid import uuid4
from .db_pytest_fixtures import *
Expand All @@ -9,7 +9,7 @@ def parse_full_csv_into_postgresql(mzid_file, peaklist, tmpdir, logger, use_data
# create temp user for user_id
user_id = 1
# create writer
writer = Writer(engine.url, user_id)
writer = DatabaseWriter(engine.url, user_id)
engine.dispose()

# parse the mzid file
Expand All @@ -23,7 +23,7 @@ def parse_no_peak_lists_csv_into_postgresql(mzid_file, peaklist, tmpdir, logger,
# create temp user for user_id
user_id = 1
# create writer
writer = Writer(engine.url, user_id)
writer = DatabaseWriter(engine.url, user_id)
engine.dispose()

# parse the mzid file
Expand All @@ -37,7 +37,7 @@ def parse_links_only_csv_into_postgresql(mzid_file, peaklist, tmpdir, logger, us
# create temp user for user_id
user_id = 1
# create writer
writer = Writer(engine.url, user_id)
writer = DatabaseWriter(engine.url, user_id)
engine.dispose()

# parse the mzid file
Expand All @@ -49,7 +49,7 @@ def parse_links_only_csv_into_postgresql(mzid_file, peaklist, tmpdir, logger, us

def parse_full_csv_into_sqllite(mzid_file, peaklist, tmpdir, logger, use_database, engine):
# create writer
writer = Writer(engine.url)
writer = DatabaseWriter(engine.url)
engine.dispose()

# parse the mzid file
Expand All @@ -61,7 +61,7 @@ def parse_full_csv_into_sqllite(mzid_file, peaklist, tmpdir, logger, use_databas

def parse_no_peak_lists_csv_into_sqllite(mzid_file, peaklist, tmpdir, logger, use_database, engine):
# create writer
writer = Writer(engine.url)
writer = DatabaseWriter(engine.url)
engine.dispose()

# parse the mzid file
Expand All @@ -73,7 +73,7 @@ def parse_no_peak_lists_csv_into_sqllite(mzid_file, peaklist, tmpdir, logger, us

def parse_links_only_csv_into_sqllite(mzid_file, peaklist, tmpdir, logger, use_database, engine):
# create writer
writer = Writer(engine.url)
writer = DatabaseWriter(engine.url)
engine.dispose()

# parse the mzid file
Expand Down
6 changes: 3 additions & 3 deletions tests/parse_mzid.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from parser import MzIdParser
from parser.api_writer import Writer
from parser.database_writer import DatabaseWriter
from sqlalchemy import text
from uuid import uuid4
from .db_pytest_fixtures import *
Expand All @@ -9,7 +9,7 @@ def parse_mzid_into_postgresql(mzid_file, peaklist, tmpdir, logger, use_database
# create temp user for user_id
user_id = 1
# create writer
writer = Writer(engine.url, user_id)
writer = DatabaseWriter(engine.url, user_id)
engine.dispose()

# parse the mzid file
Expand All @@ -20,7 +20,7 @@ def parse_mzid_into_postgresql(mzid_file, peaklist, tmpdir, logger, use_database

def parse_mzid_into_sqlite_xispec(mzid_file, peaklist, tmpdir, logger, engine):
# create writer
writer = Writer(engine.url)
writer = DatabaseWriter(engine.url)
engine.dispose()

# parse the mzid file
Expand Down
2 changes: 1 addition & 1 deletion tests/test_CsvParsers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from parser.api_writer import Table
from sqlalchemy import Table
import logging
from parser.peaklistReader.PeakListWrapper import PeakListWrapper
from .db_pytest_fixtures import *
Expand Down
2 changes: 1 addition & 1 deletion tests/test_MzIdParser_ecoli_dsso.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
import numpy as np
from numpy.testing import assert_array_equal
from parser.api_writer import Table
from sqlalchemy import Table
import os
import logging
from sqlalchemy import text
Expand Down
2 changes: 1 addition & 1 deletion tests/test_MzIdParser_matrixscience.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from parser.api_writer import Writer, Table
from sqlalchemy import Table
import os
import logging
from .db_pytest_fixtures import *
Expand Down
2 changes: 1 addition & 1 deletion tests/test_MzIdParser_matrixscience2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from parser.api_writer import Writer, Table
from sqlalchemy import Table
import os
import logging
from .db_pytest_fixtures import *
Expand Down
1 change: 0 additions & 1 deletion tests/test_fasta_reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os

from parser import SimpleFASTA
from parser.api_writer import Table
import logging
from parser.peaklistReader.PeakListWrapper import PeakListWrapper
from .db_pytest_fixtures import *
Expand Down

0 comments on commit 3a67c4b

Please sign in to comment.