From 7553df10854d8f3b50c7ea632575a9f133748c88 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 29 Feb 2024 11:16:46 +0100 Subject: [PATCH 01/13] Try-except rsync using subprocess --- ExonCov/cli.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index 2771d09..e698b84 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -3,7 +3,7 @@ import sys import re import time -import subprocess +from subprocess import run as subprocess_run, PIPE, Popen, CalledProcessError import os import shlex import urllib @@ -196,7 +196,7 @@ def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite, temp_dir = temp_path # Run sambamba - p = subprocess.Popen(shlex.split(sambamba_command), stdout=subprocess.PIPE, encoding='utf-8') + p = Popen(shlex.split(sambamba_command), stdout=PIPE, encoding='utf-8') exon_measurement_file_path = '{0}/{1}'.format(temp_dir, sample.exon_measurement_file) with open(exon_measurement_file_path, "w") as exon_measurement_file: for line in p.stdout: @@ -271,7 +271,16 @@ def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite, exon_measurement_file_path_gz = '{0}.gz'.format(exon_measurement_file_path) pysam.tabix_compress(exon_measurement_file_path, exon_measurement_file_path_gz) pysam.tabix_index(exon_measurement_file_path_gz, seq_col=0, start_col=1, end_col=2) - os.system('rsync {0}* {1}'.format(exon_measurement_file_path_gz, app.config['EXON_MEASUREMENTS_RSYNC_PATH'])) + + # external subprocess + try: + command_result = subprocess_run( + f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", + shell=True, stdout=PIPE + ) + command_result.check_returncode() + except CalledProcessError: + sys.exit(f'ERROR: Rsync unsuccesful, returncode {command_result.returncode}.') # Change exon_measurement_file to path on server. sample = Sample.query.get(sample_id) From 300654d7b7b5e7532e405007d5458e746b52d2d8 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 29 Feb 2024 11:21:26 +0100 Subject: [PATCH 02/13] remove import os --- ExonCov/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index e698b84..c0a5bbc 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -4,7 +4,6 @@ import re import time from subprocess import run as subprocess_run, PIPE, Popen, CalledProcessError -import os import shlex import urllib import datetime From 88efadc2bb38b74625613d202a93401514af4c75 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 29 Feb 2024 11:25:52 +0100 Subject: [PATCH 03/13] subprocess_run outside try. --- ExonCov/cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index c0a5bbc..72b652c 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -271,12 +271,12 @@ def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite, pysam.tabix_compress(exon_measurement_file_path, exon_measurement_file_path_gz) pysam.tabix_index(exon_measurement_file_path_gz, seq_col=0, start_col=1, end_col=2) - # external subprocess + # External subprocess + command_result = subprocess_run( + f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", shell=True, stdout=PIPE + ) + # Check returncode and raise CalledProcessError if non-zero. try: - command_result = subprocess_run( - f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", - shell=True, stdout=PIPE - ) command_result.check_returncode() except CalledProcessError: sys.exit(f'ERROR: Rsync unsuccesful, returncode {command_result.returncode}.') From 0caccf55a00fbeb8e53a5e3a4a9de8d506584c38 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 29 Feb 2024 11:58:33 +0100 Subject: [PATCH 04/13] Move compres, index and rsync exon after transcript measurements --- ExonCov/cli.py | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index 72b652c..795b856 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -266,30 +266,6 @@ def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite, 'measurement_percentage100': measurement_percentage100, } - # Compress, index and rsync - exon_measurement_file_path_gz = '{0}.gz'.format(exon_measurement_file_path) - pysam.tabix_compress(exon_measurement_file_path, exon_measurement_file_path_gz) - pysam.tabix_index(exon_measurement_file_path_gz, seq_col=0, start_col=1, end_col=2) - - # External subprocess - command_result = subprocess_run( - f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", shell=True, stdout=PIPE - ) - # Check returncode and raise CalledProcessError if non-zero. - try: - command_result.check_returncode() - except CalledProcessError: - sys.exit(f'ERROR: Rsync unsuccesful, returncode {command_result.returncode}.') - - # Change exon_measurement_file to path on server. - sample = Sample.query.get(sample_id) - sample.exon_measurement_file = '{0}/{1}.gz'.format( - app.config['EXON_MEASUREMENTS_RSYNC_PATH'].split(':')[-1], - sample.exon_measurement_file - ) - db.session.add(sample) - db.session.commit() - # Set transcript measurements transcripts = Transcript.query.options(joinedload('exons')).all() transcripts_measurements = {} @@ -326,6 +302,30 @@ def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite, db.session.bulk_insert_mappings(TranscriptMeasurement, transcript_values[i:i+bulk_insert_n]) db.session.commit() + # Compress, index and rsync exon_measurements + exon_measurement_file_path_gz = '{0}.gz'.format(exon_measurement_file_path) + pysam.tabix_compress(exon_measurement_file_path, exon_measurement_file_path_gz) + pysam.tabix_index(exon_measurement_file_path_gz, seq_col=0, start_col=1, end_col=2) + + # External subprocess + command_result = subprocess_run( + f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", shell=True, stdout=PIPE + ) + # Check returncode and raise CalledProcessError if non-zero. + try: + command_result.check_returncode() + except CalledProcessError: + sys.exit(f'ERROR: Rsync unsuccesful, returncode {command_result.returncode}.') + + # Change exon_measurement_file to path on server. + sample = Sample.query.get(sample_id) + sample.exon_measurement_file = '{0}/{1}.gz'.format( + app.config['EXON_MEASUREMENTS_RSYNC_PATH'].split(':')[-1], + sample.exon_measurement_file + ) + db.session.add(sample) + db.session.commit() + # Remove temp_dir if not temp_path: shutil.rmtree(temp_dir) From 7fcb8560dac773564bf55609a24dd7d009d80f5a Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 12 Mar 2024 16:01:53 +0100 Subject: [PATCH 05/13] Fix joinedload --- ExonCov/views.py | 124 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/ExonCov/views.py b/ExonCov/views.py index 6838aa6..3cb51e5 100644 --- a/ExonCov/views.py +++ b/ExonCov/views.py @@ -13,7 +13,7 @@ from . import app, db from .models import ( Sample, SampleProject, SampleSet, SequencingRun, PanelVersion, Panel, CustomPanel, Gene, Transcript, - TranscriptMeasurement, panels_transcripts + TranscriptMeasurement, panels_transcripts, Exon ) from .forms import ( MeasurementTypeForm, CustomPanelForm, CustomPanelNewForm, CustomPanelValidateForm, SampleForm, @@ -44,8 +44,8 @@ def samples(): Sample.query .order_by(Sample.import_date.desc()) .order_by(Sample.name.asc()) - .options(joinedload('sequencing_runs')) - .options(joinedload('project')) + .options(joinedload(Sample.sequencing_runs)) + .options(joinedload(Sample.project)) ) if (sample or project or run or sample_type) and sample_form.validate(): @@ -79,9 +79,7 @@ def sample(id): # Query Sample and panels sample = ( Sample.query - .options(joinedload('sequencing_runs')) - .options(joinedload('project')) - .options(joinedload('custom_panels')) + .options(joinedload(Sample.sequencing_runs), joinedload(Sample.project), joinedload(Sample.custom_panels)) .get_or_404(id) ) query = ( @@ -92,6 +90,7 @@ def sample(id): .join(TranscriptMeasurement) .filter_by(sample_id=sample.id) .order_by(PanelVersion.panel_name) + .options(joinedload(PanelVersion.panel)) .all() ) measurement_types = { @@ -127,7 +126,7 @@ def sample(id): @login_required def sample_inactive_panels(id): """Sample inactive panels page.""" - sample = Sample.query.options(joinedload('sequencing_runs')).options(joinedload('project')).get_or_404(id) + sample = Sample.query.options(joinedload(Sample.sequencing_runs)).options(joinedload(Sample.project)).get_or_404(id) measurement_types = { 'measurement_mean_coverage': 'Mean coverage', 'measurement_percentage10': '>10', @@ -135,7 +134,17 @@ def sample_inactive_panels(id): 'measurement_percentage30': '>30', 'measurement_percentage100': '>100', } - query = db.session.query(PanelVersion, TranscriptMeasurement).filter_by(active=False).filter_by(validated=True).join(Transcript, PanelVersion.transcripts).join(TranscriptMeasurement).filter_by(sample_id=sample.id).order_by(PanelVersion.panel_name).all() + query = ( + db.session.query(PanelVersion, TranscriptMeasurement) + .filter_by(active=False) + .filter_by(validated=True) + .join(Transcript, PanelVersion.transcripts) + .join(TranscriptMeasurement) + .filter_by(sample_id=sample.id) + .order_by(PanelVersion.panel_name) + .options(joinedload(PanelVersion.panel)) + .all() + ) panels = {} for panel, transcript_measurement in query: @@ -162,8 +171,8 @@ def sample_inactive_panels(id): @login_required def sample_panel(sample_id, panel_id): """Sample panel page.""" - sample = Sample.query.options(joinedload('sequencing_runs')).options(joinedload('project')).get_or_404(sample_id) - panel = PanelVersion.query.options(joinedload('core_genes')).get_or_404(panel_id) + sample = Sample.query.options(joinedload(Sample.sequencing_runs)).options(joinedload(Sample.project)).get_or_404(sample_id) + panel = PanelVersion.query.options(joinedload(PanelVersion.core_genes)).get_or_404(panel_id) measurement_types = { 'measurement_mean_coverage': 'Mean coverage', @@ -213,8 +222,8 @@ def sample_panel(sample_id, panel_id): @login_required def sample_transcript(sample_id, transcript_name): """Sample transcript page.""" - sample = Sample.query.options(joinedload('sequencing_runs')).options(joinedload('project')).get_or_404(sample_id) - transcript = Transcript.query.filter_by(name=transcript_name).options(joinedload('exons')).first() + sample = Sample.query.options(joinedload(Sample.sequencing_runs)).options(joinedload(Sample.project)).get_or_404(sample_id) + transcript = Transcript.query.filter_by(name=transcript_name).options(joinedload(Transcript.exons)).first() exon_measurements = [] try: @@ -247,7 +256,7 @@ def sample_transcript(sample_id, transcript_name): @login_required def sample_gene(sample_id, gene_id): """Sample gene page.""" - sample = Sample.query.options(joinedload('sequencing_runs')).options(joinedload('project')).get_or_404(sample_id) + sample = Sample.query.options(joinedload(Sample.sequencing_runs)).options(joinedload(Sample.project)).get_or_404(sample_id) gene = Gene.query.get_or_404(gene_id) measurement_types = { @@ -266,7 +275,7 @@ def sample_gene(sample_id, gene_id): @login_required def panels(): """Panel overview page.""" - panels = Panel.query.options(joinedload('versions')).all() + panels = Panel.query.options(joinedload(Panel.versions)).all() return render_template('panels.html', panels=panels, custom_panels=custom_panels) @@ -274,7 +283,7 @@ def panels(): @login_required def panel(name): """Panel page.""" - panel = Panel.query.filter_by(name=name).options(joinedload('versions').joinedload('transcripts')).first_or_404() + panel = Panel.query.filter_by(name=name).options(joinedload(Panel.versions).joinedload(PanelVersion.transcripts)).first_or_404() return render_template('panel.html', panel=panel) @@ -283,7 +292,7 @@ def panel(name): @roles_required('panel_admin') def panel_new_version(name): """Create new panel version page.""" - panel = Panel.query.filter_by(name=name).options(joinedload('versions').joinedload('transcripts')).first_or_404() + panel = Panel.query.filter_by(name=name).options(joinedload(Panel.versions).joinedload(PanelVersion.transcripts)).first_or_404() panel_last_version = panel.last_version genes = '\n'.join([transcript.gene_id for transcript in panel_last_version.transcripts]) @@ -409,7 +418,14 @@ def panel_new(): @login_required def panel_version(id): """PanelVersion page.""" - panel = PanelVersion.query.options(joinedload('transcripts').joinedload('exons')).options(joinedload('transcripts').joinedload('gene')).get_or_404(id) + panel = ( + PanelVersion.query + .options( + joinedload(PanelVersion.transcripts).joinedload(Transcript.exons), + joinedload(PanelVersion.transcripts).joinedload(Transcript.gene) + ) + .get_or_404(id) + ) return render_template('panel_version.html', panel=panel) @@ -500,7 +516,12 @@ def custom_panel_new(): @login_required def custom_panel(id): """Custom panel page.""" - custom_panel = CustomPanel.query.options(joinedload('transcripts')).options(joinedload('samples')).get_or_404(id) + custom_panel = ( + CustomPanel.query + .options(joinedload(CustomPanel.transcripts)) + .options(joinedload(CustomPanel.samples)) + .get_or_404(id) + ) measurement_type_form = MeasurementTypeForm() sample_ids = [sample.id for sample in custom_panel.samples] @@ -510,7 +531,13 @@ def custom_panel(id): panel_measurements = {} sample_stats = {sample: [[], {}] for sample in custom_panel.samples} - query = TranscriptMeasurement.query.filter(TranscriptMeasurement.sample_id.in_(sample_ids)).filter(TranscriptMeasurement.transcript_id.in_(transcript_ids)).options(joinedload('transcript').joinedload('gene')).all() + query = ( + TranscriptMeasurement.query + .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) + .filter(TranscriptMeasurement.transcript_id.in_(transcript_ids)) + .options(joinedload(TranscriptMeasurement.transcript).joinedload(Transcript.gene)) + .all() + ) for transcript_measurement in query: sample = transcript_measurement.sample @@ -551,11 +578,17 @@ def custom_panel(id): @login_required def custom_panel_transcript(id, transcript_name): """Custom panel transcript page.""" - custom_panel = CustomPanel.query.options(joinedload('samples')).get_or_404(id) + custom_panel = CustomPanel.query.options(joinedload(CustomPanel.samples)).get_or_404(id) measurement_type_form = MeasurementTypeForm() sample_ids = [sample.id for sample in custom_panel.samples] - transcript = Transcript.query.filter_by(name=transcript_name).options(joinedload('gene')).options(joinedload('exons')).first() + transcript = ( + Transcript.query + .filter_by(name=transcript_name) + .options(joinedload(Transcript.gene)) + .options(joinedload(Transcript.exons)) + .first() + ) measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] transcript_measurements = {} exon_measurements = {} @@ -612,7 +645,15 @@ def custom_panel_gene(id, gene_id): transcript_measurements = {} if sample_ids and measurement_type: - query = TranscriptMeasurement.query.filter(TranscriptMeasurement.sample_id.in_(sample_ids)).join(Transcript).filter_by(gene_id=gene_id).options(joinedload('sample')).options(joinedload('transcript')).all() + query = ( + TranscriptMeasurement.query + .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) + .join(Transcript) + .filter_by(gene_id=gene_id) + .options(joinedload(TranscriptMeasurement.sample)) + .options(joinedload(TranscriptMeasurement.transcript)) + .all() + ) for transcript_measurement in query: sample = transcript_measurement.sample transcript = transcript_measurement.transcript @@ -651,7 +692,7 @@ def custom_panel_validated(id): @login_required def sample_sets(): """Sample sets page.""" - sample_sets = SampleSet.query.options(joinedload('samples')).filter_by(active=True).all() + sample_sets = SampleSet.query.options(joinedload(SampleSet.samples)).filter_by(active=True).all() panel_gene_form = SampleSetPanelGeneForm() @@ -668,7 +709,7 @@ def sample_sets(): @login_required def sample_set(id): """Sample set page.""" - sample_set = SampleSet.query.options(joinedload('samples')).get_or_404(id) + sample_set = SampleSet.query.options(joinedload(SampleSet.samples)).get_or_404(id) measurement_type_form = MeasurementTypeForm() sample_ids = [sample.id for sample in sample_set.samples] @@ -686,6 +727,7 @@ def sample_set(id): .join(TranscriptMeasurement) .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) .order_by(PanelVersion.panel_name) + .options(joinedload(PanelVersion.panel)) .all() ) @@ -718,8 +760,8 @@ def sample_set(id): @login_required def sample_set_panel(sample_set_id, panel_id): """Sample set panel page.""" - sample_set = SampleSet.query.options(joinedload('samples')).get_or_404(sample_set_id) - panel = PanelVersion.query.options(joinedload('transcripts')).get_or_404(panel_id) + sample_set = SampleSet.query.options(joinedload(SampleSet.samples)).get_or_404(sample_set_id) + panel = PanelVersion.query.options(joinedload(PanelVersion.transcripts)).get_or_404(panel_id) measurement_type_form = MeasurementTypeForm() sample_ids = [sample.id for sample in sample_set.samples] @@ -727,7 +769,13 @@ def sample_set_panel(sample_set_id, panel_id): measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] transcript_measurements = {} - query = TranscriptMeasurement.query.filter(TranscriptMeasurement.sample_id.in_(sample_ids)).filter(TranscriptMeasurement.transcript_id.in_(transcript_ids)).options(joinedload('transcript').joinedload('gene')).all() + query = ( + TranscriptMeasurement.query + .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) + .filter(TranscriptMeasurement.transcript_id.in_(transcript_ids)) + .options(joinedload(TranscriptMeasurement.transcript).joinedload(Transcript.gene)) + .all() + ) for transcript_measurement in query: sample = transcript_measurement.sample @@ -748,8 +796,14 @@ def sample_set_panel(sample_set_id, panel_id): @login_required def sample_set_transcript(sample_set_id, transcript_name): """Sample set transcript page.""" - sample_set = SampleSet.query.options(joinedload('samples')).get_or_404(sample_set_id) - transcript = Transcript.query.filter_by(name=transcript_name).options(joinedload('gene')).options(joinedload('exons')).first() + sample_set = SampleSet.query.options(joinedload(SampleSet.samples)).get_or_404(sample_set_id) + transcript = ( + Transcript.query + .filter_by(name=transcript_name) + .options(joinedload(Transcript.gene)) + .options(joinedload(Transcript.exons)) + .first() + ) measurement_type_form = MeasurementTypeForm() measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] @@ -785,7 +839,7 @@ def sample_set_transcript(sample_set_id, transcript_name): @login_required def sample_set_gene(sample_set_id, gene_id): """Sample set gene page.""" - sample_set = SampleSet.query.options(joinedload('samples')).get_or_404(sample_set_id) + sample_set = SampleSet.query.options(joinedload(SampleSet.samples)).get_or_404(sample_set_id) gene = Gene.query.get_or_404(gene_id) measurement_type_form = MeasurementTypeForm() @@ -794,7 +848,15 @@ def sample_set_gene(sample_set_id, gene_id): transcript_measurements = {} if sample_ids and measurement_type: - query = TranscriptMeasurement.query.filter(TranscriptMeasurement.sample_id.in_(sample_ids)).join(Transcript).filter_by(gene_id=gene_id).options(joinedload('sample')).options(joinedload('transcript')).all() + query = ( + TranscriptMeasurement.query + .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) + .join(Transcript) + .filter_by(gene_id=gene_id) + .options(joinedload(TranscriptMeasurement.sample)) + .options(joinedload(TranscriptMeasurement.transcript)) + .all() + ) for transcript_measurement in query: sample = transcript_measurement.sample transcript = transcript_measurement.transcript From a09f9bd6f65662309cc82984dfc560a0fa56d341 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 12 Mar 2024 16:47:06 +0100 Subject: [PATCH 06/13] Fix sort --- ExonCov/views.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ExonCov/views.py b/ExonCov/views.py index 3cb51e5..ae652a1 100644 --- a/ExonCov/views.py +++ b/ExonCov/views.py @@ -2,6 +2,7 @@ import time import datetime +from operator import attrgetter from flask import render_template, request, redirect, url_for from flask_security import login_required, roles_required @@ -307,7 +308,7 @@ def panel_new_version(name): transcripts = panel_new_version_form.transcripts # Check for panel changes - if sorted(transcripts) == sorted(panel_last_version.transcripts): + if sorted(transcripts, key=attrgetter('id')) == sorted(panel_last_version.transcripts, key=attrgetter('id')): panel_new_version_form.gene_list.errors.append('No changes.') else: From 9f03ecbac6699b8f7246874680b5ff2ddf10f619 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 12 Mar 2024 16:50:20 +0100 Subject: [PATCH 07/13] Fix webapp on python 3.11 --- ExonCov.py | 2 +- ExonCov/__init__.py | 3 +- ExonCov/forms.py | 12 +-- ExonCov/models.py | 2 + .../743f453ff85b_add_fs_uniquifier.py | 84 +++++++++++++++++++ requirements.txt | 29 +++---- 6 files changed, 107 insertions(+), 25 deletions(-) create mode 100644 migrations/versions/743f453ff85b_add_fs_uniquifier.py diff --git a/ExonCov.py b/ExonCov.py index 5b19828..8f08595 100755 --- a/ExonCov.py +++ b/ExonCov.py @@ -6,7 +6,7 @@ from ExonCov import app, db, cli manager = Manager(app) -migrate = Migrate(app, db) +# migrate = Migrate(app, db) db_manager = Manager(usage='Database commands.') manager.add_command("db", db_manager) diff --git a/ExonCov/__init__.py b/ExonCov/__init__.py index 62b8bee..85670c8 100644 --- a/ExonCov/__init__.py +++ b/ExonCov/__init__.py @@ -8,13 +8,14 @@ import flask_admin from flask_security import Security, SQLAlchemyUserDatastore from flask_debugtoolbar import DebugToolbarExtension - +from flask_migrate import Migrate from ExonCov.utils import url_for_other_page, event_logger # Setup APP app = Flask(__name__) app.config.from_object('config') db = SQLAlchemy(app) +migrate = Migrate(app, db) csrf = CSRFProtect(app) admin = flask_admin.Admin(app, name='ExonCov Admin', template_mode='bootstrap3') diff --git a/ExonCov/forms.py b/ExonCov/forms.py index 0fd3765..904c562 100644 --- a/ExonCov/forms.py +++ b/ExonCov/forms.py @@ -132,7 +132,7 @@ class CustomPanelNewForm(FlaskForm): comments = TextAreaField('Comments', description="Provide a short description.") transcripts = [] # Filled in validate function - def validate(self): + def validate(self, extra_validators=[]): """Extra validation, used to validate gene list.""" # Default validation as defined in field validators self.transcripts = [] # Reset transcripts on validation @@ -228,7 +228,7 @@ class CreatePanelForm(FlaskForm): transcript = [] # Filled in validate function core_genes = [] - def validate(self): + def validate(self, extra_validators=[]): """Additional validation, used to validate gene list and panel name.""" # Reset on validation @@ -281,7 +281,7 @@ class PanelNewVersionForm(FlaskForm): transcript = [] core_genes = [] - def validate(self): + def validate(self, extra_validators=[]): """Additional validation, used to validate gene list.""" # Reset before validation @@ -333,7 +333,7 @@ class PanelVersionEditForm(FlaskForm): core_genes = [] - def validate(self): + def validate(self, extra_validators=[]): """Extra validation, used to validate core gene list.""" self.core_genes = [] # Reset core_genes on validation @@ -371,7 +371,7 @@ class SampleSetPanelGeneForm(FlaskForm): gene_id = '' - def validate(self): + def validate(self, extra_validators=[]): """Extra validation to parse panel / gene selection""" self.gene_id = '' valid_form = True @@ -404,7 +404,7 @@ class SampleGeneForm(FlaskForm): gene = StringField('Gene', validators=[validators.InputRequired()]) gene_id = '' - def validate(self): + def validate(self, extra_validators=[]): """Extra validation to parse panel / gene selection""" self.gene_id = '' valid_form = True diff --git a/ExonCov/models.py b/ExonCov/models.py index 625c2e7..0bdee4b 100644 --- a/ExonCov/models.py +++ b/ExonCov/models.py @@ -411,6 +411,8 @@ class User(db.Model, UserMixin): last_name = db.Column(db.String(255), nullable=False) active = db.Column(db.Boolean(), index=True, nullable=False) + fs_uniquifier = db.Column(db.String(255), unique=True, nullable=False) + roles = db.relationship('Role', secondary=roles_users, lazy='joined', backref=db.backref('users')) def __init__(self, email, password, first_name, last_name, active, roles): diff --git a/migrations/versions/743f453ff85b_add_fs_uniquifier.py b/migrations/versions/743f453ff85b_add_fs_uniquifier.py new file mode 100644 index 0000000..4b36ac1 --- /dev/null +++ b/migrations/versions/743f453ff85b_add_fs_uniquifier.py @@ -0,0 +1,84 @@ +"""Add fs_uniquifier + +Revision ID: 743f453ff85b +Revises: bcd8bff69995 +Create Date: 2024-03-11 23:47:09.778701 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + +# revision identifiers, used by Alembic. +revision = '743f453ff85b' +down_revision = 'bcd8bff69995' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + # with op.batch_alter_table('event_logs', schema=None) as batch_op: + # batch_op.create_index(batch_op.f('ix_event_logs_user_id'), ['user_id'], unique=False) + + # with op.batch_alter_table('exons', schema=None) as batch_op: + # batch_op.drop_index('ix_exons_chr') + + # with op.batch_alter_table('sample_projects', schema=None) as batch_op: + # batch_op.alter_column('name', + # existing_type=mysql.VARCHAR(length=255), + # type_=sa.String(length=50), + # existing_nullable=False) + # batch_op.alter_column('type', + # existing_type=mysql.VARCHAR(length=255), + # nullable=True) + + # with op.batch_alter_table('sequencing_runs', schema=None) as batch_op: + # batch_op.alter_column('name', + # existing_type=mysql.VARCHAR(length=255), + # type_=sa.String(length=50), + # existing_nullable=True) + + # with op.batch_alter_table('transcript_measurements', schema=None) as batch_op: + # batch_op.create_unique_constraint(None, ['transcript_id', 'sample_id']) + # batch_op.create_foreign_key(None, 'transcripts', ['transcript_id'], ['id']) + + with op.batch_alter_table('user', schema=None) as batch_op: + batch_op.add_column(sa.Column('fs_uniquifier', sa.String(length=255), nullable=False)) + # batch_op.create_unique_constraint(None, ['fs_uniquifier']) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('user', schema=None) as batch_op: + # batch_op.drop_constraint(None, type_='unique') + batch_op.drop_column('fs_uniquifier') + + # with op.batch_alter_table('transcript_measurements', schema=None) as batch_op: + # batch_op.drop_constraint(None, type_='foreignkey') + # batch_op.drop_constraint(None, type_='unique') + + # with op.batch_alter_table('sequencing_runs', schema=None) as batch_op: + # batch_op.alter_column('name', + # existing_type=sa.String(length=50), + # type_=mysql.VARCHAR(length=255), + # existing_nullable=True) + + # with op.batch_alter_table('sample_projects', schema=None) as batch_op: + # batch_op.alter_column('type', + # existing_type=mysql.VARCHAR(length=255), + # nullable=False) + # batch_op.alter_column('name', + # existing_type=sa.String(length=50), + # type_=mysql.VARCHAR(length=255), + # existing_nullable=False) + + # with op.batch_alter_table('exons', schema=None) as batch_op: + # batch_op.create_index('ix_exons_chr', ['chr'], unique=False) + + # with op.batch_alter_table('event_logs', schema=None) as batch_op: + # batch_op.drop_index(batch_op.f('ix_event_logs_user_id')) + + # ### end Alembic commands ### diff --git a/requirements.txt b/requirements.txt index a7b0ad7..529b734 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,12 @@ -Flask-Admin==1.5.3 -Flask-DebugToolbar==0.10.1 -Flask-Login==0.4.1 -Flask-Migrate==2.5.2 -Flask-SQLAlchemy==2.4.4 -Flask-Script==2.0.6 -Flask-Security==3.0.0 -Flask-WTF==0.14.2 -Flask==1.0.2 -SQLAlchemy==1.3.5 -backports.tempfile==1.0 -email_validator==1.1.3 -future==0.18.2 -mysql-connector-python==8.0.29 -pysam==0.15.2 -werkzeug==0.16.1 -WTForms==2.1 +email_validator==2.1.1 +Flask==2.2.5 +Flask-Admin==1.6.1 +Flask-DebugToolbar==0.14.1 +Flask-Migrate==4.0.7 +Flask-SQLAlchemy==3.1.1 +Flask-WTF==1.2.1 +mysql-connector-python==8.3.0 +pysam==0.22.0 +pytz==2024.1 +WTForms==2.3.3 +Flask-Security-Too==4.1.6 \ No newline at end of file From a307716874df275899235021132de8e709b309a4 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 14 Mar 2024 14:09:39 +0100 Subject: [PATCH 08/13] Move cli to Click module used by Flask --- ExonCov.py | 31 - ExonCov/__init__.py | 8 +- ExonCov/cli.py | 1577 +++++++++++++++++++++---------------------- README.md | 18 +- 4 files changed, 793 insertions(+), 841 deletions(-) delete mode 100755 ExonCov.py diff --git a/ExonCov.py b/ExonCov.py deleted file mode 100755 index 8f08595..0000000 --- a/ExonCov.py +++ /dev/null @@ -1,31 +0,0 @@ -#!venv/bin/python -"""ExonCov CLI.""" -from flask_script import Manager -from flask_migrate import Migrate, MigrateCommand - -from ExonCov import app, db, cli - -manager = Manager(app) -# migrate = Migrate(app, db) -db_manager = Manager(usage='Database commands.') - -manager.add_command("db", db_manager) -manager.add_command('import_bam', cli.ImportBam()) -manager.add_command('search_sample', cli.SearchSample()) -manager.add_command('sample_qc', cli.SampleQC()) -manager.add_command('remove_sample', cli.RemoveSample()) -manager.add_command('check_samples', cli.CheckSamples()) -manager.add_command('create_sample_set', cli.CreateSampleSet()) - -db_manager.add_command('migrate', MigrateCommand) -db_manager.add_command('stats', cli.PrintStats()) -db_manager.add_command('panel_genes', cli.PrintPanelGenesTable()) -db_manager.add_command('coverage_stats', cli.ExportCovStatsSampleSet()) -db_manager.add_command('gene_transcripts', cli.PrintTranscripts()) -db_manager.add_command('import_alias_table', cli.ImportAliasTable()) -db_manager.add_command('export_alias_table', cli.ExportAliasTable()) -db_manager.add_command('export_panel_bed', cli.PrintPanelBed()) -# db_manager.add_command('load_design', cli.LoadDesign()) # Disabled, can be used to setup a new database from scratch - -if __name__ == "__main__": - manager.run() diff --git a/ExonCov/__init__.py b/ExonCov/__init__.py index 85670c8..ab6fa72 100644 --- a/ExonCov/__init__.py +++ b/ExonCov/__init__.py @@ -9,13 +9,14 @@ from flask_security import Security, SQLAlchemyUserDatastore from flask_debugtoolbar import DebugToolbarExtension from flask_migrate import Migrate + from ExonCov.utils import url_for_other_page, event_logger # Setup APP app = Flask(__name__) app.config.from_object('config') db = SQLAlchemy(app) -migrate = Migrate(app, db) +migrate = Migrate(app, db, command='db_migrate') csrf = CSRFProtect(app) admin = flask_admin.Admin(app, name='ExonCov Admin', template_mode='bootstrap3') @@ -28,7 +29,10 @@ app.jinja_env.globals['git_version'] = check_output(git_command + ['describe', '--tags']).decode('ascii').strip() app.jinja_env.globals['git_commit'] = check_output(git_command + ['rev-parse', 'HEAD']).decode('ascii').strip() -from . import views, api_views, admin_views, models, forms +from . import views, api_views, admin_views, models, forms, cli + +# Setup CLI +app.cli.add_command(cli.db_cli) # Setup flask_security user_datastore = SQLAlchemyUserDatastore(db, models.User, models.Role) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index 795b856..c19fc5c 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -5,10 +5,11 @@ import time from subprocess import run as subprocess_run, PIPE, Popen, CalledProcessError import shlex -import urllib +import urllib.request import datetime -from flask_script import Command, Option +import click +from flask.cli import AppGroup from sqlalchemy.orm import joinedload from sqlalchemy.exc import IntegrityError from sqlalchemy.orm.exc import NoResultFound @@ -23,420 +24,332 @@ PanelVersion, panels_transcripts, CustomPanel, SampleSet ) +db_cli = AppGroup('db', help="Database commands.") -class PrintStats(Command): - """Print database stats.""" - def run(self): - print("Number of genes: {0}".format(Gene.query.count())) - print("Number of transcripts: {0}".format(Transcript.query.count())) - print("Number of exons: {0}".format(Exon.query.count())) - print("Number of panels: {0}".format(Panel.query.count())) - print("Number of custom panels: {0}".format(CustomPanel.query.count())) - print("Number of samples: {0}".format(Sample.query.count())) - print("Number of sequencing runs: {0}".format(SequencingRun.query.count())) - print("Number of sequencing projects: {0}".format(SampleProject.query.count())) - - print("Number of projects and samples per year:") - projects_year = {} - for project in SampleProject.query.options(joinedload('samples')): - if project.name[0:6].isdigit() and project.samples: - project_year = project.name[0:2] - if project_year not in projects_year: - projects_year[project_year] = [0, 0] - projects_year[project_year][0] += 1 - projects_year[project_year][1] += len(project.samples) - - print("Year\tProjects\tSamples") - for year, count in sorted(projects_year.items()): - print("{year}\t{project_count}\t{sample_count}".format(year=year, project_count=count[0], sample_count=count[1])) - - -class PrintPanelGenesTable(Command): +@db_cli.command('stats') +def print_stats(): + """Print database stats.""" + print("Number of genes: {0}".format(Gene.query.count())) + print("Number of transcripts: {0}".format(Transcript.query.count())) + print("Number of exons: {0}".format(Exon.query.count())) + print("Number of panels: {0}".format(Panel.query.count())) + print("Number of custom panels: {0}".format(CustomPanel.query.count())) + print("Number of samples: {0}".format(Sample.query.count())) + print("Number of sequencing runs: {0}".format(SequencingRun.query.count())) + print("Number of sequencing projects: {0}".format(SampleProject.query.count())) + + print("Number of projects and samples per year:") + projects_year = {} + for project in SampleProject.query.options(joinedload(SampleProject.samples)): + if project.name[0:6].isdigit() and project.samples: + project_year = project.name[0:2] + if project_year not in projects_year: + projects_year[project_year] = [0, 0] + projects_year[project_year][0] += 1 + projects_year[project_year][1] += len(project.samples) + + print("Year\tProjects\tSamples") + for year, count in sorted(projects_year.items()): + print("{year}\t{project_count}\t{sample_count}".format(year=year, project_count=count[0], sample_count=count[1])) + + +@db_cli.command('panel_genes') +@click.option('--archived_panels', '-a', default=True, is_flag=True) +def print_panel_genes_table(archived_panels): """Print tab delimited panel / genes table.""" - option_list = ( - Option( - '-a', '--archived_panels', dest='active_panels', default=True, - action='store_false', help="Use archived panels instead of active panels" - ), - ) + print('{panel}\t{release_date}\t{genes}'.format(panel='panel_version', release_date='release_date', genes='genes')) - def run(self, active_panels): - print('{panel}\t{release_date}\t{genes}'.format(panel='panel_version', release_date='release_date', genes='genes')) + panel_versions = PanelVersion.query.filter_by(active=archived_panels).options(joinedload(PanelVersion.transcripts)) - panel_versions = PanelVersion.query.filter_by(active=active_panels).options(joinedload('transcripts')) - - for panel in panel_versions: - print('{panel}\t{release_date}\t{genes}'.format( - panel=panel.name_version, - release_date=panel.release_date, - genes='\t'.join([transcript.gene_id for transcript in panel.transcripts]) - )) + for panel in panel_versions: + print('{panel}\t{release_date}\t{genes}'.format( + panel=panel.name_version, + release_date=panel.release_date, + genes='\t'.join([transcript.gene_id for transcript in panel.transcripts]) + )) -class PrintTranscripts(Command): +@db_cli.command('gene_transcripts') +@click.option('--preferred_transcripts', '-p', is_flag=True) +def print_transcripts(preferred_transcripts): """Print tab delimited gene / transcript table""" + print('{gene}\t{transcript}'.format(gene='Gene', transcript='Transcript')) - option_list = ( - Option('-p', '--preferred_transcripts', dest='preferred_transcripts', default=False, action='store_true', help="Print preferred transcripts only"), - ) - - def run(self, preferred_transcripts): - print('{gene}\t{transcript}'.format(gene='Gene', transcript='Transcript')) - - genes = Gene.query.options(joinedload('transcripts')) + genes = Gene.query.options(joinedload(Gene.transcripts)) - for gene in genes: - if preferred_transcripts: - print('{gene}\t{transcript}'.format(gene=gene.id, transcript=gene.default_transcript.name)) - else: - for transcript in gene.transcripts: - print('{gene}\t{transcript}'.format(gene=gene.id, transcript=transcript.name)) - - -class ImportBam(Command): + for gene in genes: + if preferred_transcripts: + print('{gene}\t{transcript}'.format(gene=gene.id, transcript=gene.default_transcript.name)) + else: + for transcript in gene.transcripts: + print('{gene}\t{transcript}'.format(gene=gene.id, transcript=transcript.name)) + + +@app.cli.command('import_bam') +@click.argument('project_name') +@click.argument('sample_type', type=click.Choice(['WES', 'WGS', 'RNA'])) +@click.argument('bam') +@click.option('-b', '--exon_bed', 'exon_bed_file', default=app.config['EXON_BED_FILE']) +@click.option('-t', '--threads', default=1) +@click.option('-f', '--overwrite', is_flag=True) +@click.option('--print_output', is_flag=True) +@click.option('--print_sample_id', is_flag=True) +@click.option('--temp', 'temp_path', default=None) +def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite, print_output, print_sample_id, temp_path): """Import sample from bam file.""" - - option_list = ( - Option('project_name'), - Option('sample_type', choices=['WES', 'WGS', 'RNA']), - Option('bam'), - Option('-b', '--exon_bed', dest='exon_bed_file', default=app.config['EXON_BED_FILE']), - Option('-t', '--threads', dest='threads', default=1), - Option('-f', '--overwrite', dest='overwrite', default=False, action='store_true'), - Option('--print_output', dest='print_output', default=False, action='store_true'), - Option('--print_sample_id', dest='print_sample_id', default=False, action='store_true'), - Option('--temp', dest='temp_path', default=None), + try: + bam_file = pysam.AlignmentFile(bam, "rb") + except IOError as e: + sys.exit(e) + + sample_name = None + sequencing_runs = {} + sequencing_run_ids = [] + exon_measurements = {} + + # Parse read groups and setup sequencing run(s) + for read_group in bam_file.header['RG']: + if not sample_name: + sample_name = read_group['SM'] + elif sample_name != read_group['SM']: + sys.exit("ERROR: Exoncov does not support bam files containing multiple samples.") + + if read_group['PU'] not in sequencing_runs: + sequencing_run, sequencing_run_exists = utils.get_one_or_create( + db.session, + SequencingRun, + platform_unit=read_group['PU'] + ) # returns object and exists bool + sequencing_runs[read_group['PU']] = sequencing_run + sequencing_run_ids.append(sequencing_run.id) + + if not sequencing_runs: + sys.exit("ERROR: Sample can not be linked to a sequencing run, please make sure that read groups contain PU values.") + + bam_file.close() + + # Setup sample project + sample_project, sample_project_exists = utils.get_one_or_create( + db.session, + SampleProject, + name=project_name, + type='' + ) # returns object and exists bool + + # Look for sample in database + sample = Sample.query.filter_by(name=sample_name).filter_by(project_id=sample_project.id).first() + if sample and overwrite: + db.session.delete(sample) + db.session.commit() + elif sample and not overwrite: + sys.exit("ERROR: Sample and project combination already exists.\t{0}".format(sample)) + + # Create sambamba command + sambamba_command = "{sambamba} depth region {bam_file} --nthreads {threads} --filter '{filter}' --regions {bed_file} {settings}".format( + sambamba=app.config['SAMBAMBA'], + bam_file=bam, + threads=threads, + filter=app.config['SAMBAMBA_FILTER'], + bed_file=exon_bed_file, + settings='--fix-mate-overlaps --min-base-quality 10 --cov-threshold 10 --cov-threshold 15 --cov-threshold 20 --cov-threshold 30 --cov-threshold 50 --cov-threshold 100', ) - def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite, print_output, print_sample_id, temp_path): - try: - bam_file = pysam.AlignmentFile(bam, "rb") - except IOError as e: - sys.exit(e) - - sample_name = None - sequencing_runs = {} - sequencing_run_ids = [] - exon_measurements = {} - - # Parse read groups and setup sequencing run(s) - for read_group in bam_file.header['RG']: - if not sample_name: - sample_name = read_group['SM'] - elif sample_name != read_group['SM']: - sys.exit("ERROR: Exoncov does not support bam files containing multiple samples.") - - if read_group['PU'] not in sequencing_runs: - sequencing_run, sequencing_run_exists = utils.get_one_or_create( - db.session, - SequencingRun, - platform_unit=read_group['PU'] - ) # returns object and exists bool - sequencing_runs[read_group['PU']] = sequencing_run - sequencing_run_ids.append(sequencing_run.id) - - if not sequencing_runs: - sys.exit("ERROR: Sample can not be linked to a sequencing run, please make sure that read groups contain PU values.") - - bam_file.close() - - # Setup sample project - sample_project, sample_project_exists = utils.get_one_or_create( - db.session, - SampleProject, - name=project_name, - type='' - ) # returns object and exists bool - - # Look for sample in database - sample = Sample.query.filter_by(name=sample_name).filter_by(project_id=sample_project.id).first() - if sample and overwrite: - db.session.delete(sample) - db.session.commit() - elif sample and not overwrite: - sys.exit("ERROR: Sample and project combination already exists.\t{0}".format(sample)) - - # Create sambamba command - sambamba_command = "{sambamba} depth region {bam_file} --nthreads {threads} --filter '{filter}' --regions {bed_file} {settings}".format( - sambamba=app.config['SAMBAMBA'], - bam_file=bam, - threads=threads, - filter=app.config['SAMBAMBA_FILTER'], - bed_file=exon_bed_file, - settings='--fix-mate-overlaps --min-base-quality 10 --cov-threshold 10 --cov-threshold 15 --cov-threshold 20 --cov-threshold 30 --cov-threshold 50 --cov-threshold 100', + # Create sample + sample = Sample( + name=sample_name, + project=sample_project, + type=sample_type, + file_name=bam, + import_command=sambamba_command, + sequencing_runs=list(sequencing_runs.values()), + exon_measurement_file='{0}_{1}'.format(sample_project.name, sample_name) ) - - # Create sample - sample = Sample( - name=sample_name, - project=sample_project, - type=sample_type, - file_name=bam, - import_command=sambamba_command, - sequencing_runs=list(sequencing_runs.values()), - exon_measurement_file='{0}_{1}'.format(sample_project.name, sample_name) - ) - db.session.add(sample) - db.session.commit() - - # Add sample id to exon measurement file - sample.exon_measurement_file = '{0}_{1}.txt'.format(sample.id, sample.exon_measurement_file) - db.session.add(sample) - db.session.commit() - # Store sample_id and close session before starting Sambamba - sample_id = sample.id - db.session.close() - - # Create temp_dir - if not temp_path: - temp_dir = tempfile.mkdtemp() - else: # assume path exist and user is responsible for this directory - temp_dir = temp_path - - # Run sambamba - p = Popen(shlex.split(sambamba_command), stdout=PIPE, encoding='utf-8') - exon_measurement_file_path = '{0}/{1}'.format(temp_dir, sample.exon_measurement_file) - with open(exon_measurement_file_path, "w") as exon_measurement_file: - for line in p.stdout: - if print_output: - print(line) - - # Header - if line.startswith('#'): - header = line.rstrip().split('\t') - measurement_mean_coverage_index = header.index('meanCoverage') - measurement_percentage10_index = header.index('percentage10') - measurement_percentage15_index = header.index('percentage15') - measurement_percentage20_index = header.index('percentage20') - measurement_percentage30_index = header.index('percentage30') - measurement_percentage50_index = header.index('percentage50') - measurement_percentage100_index = header.index('percentage100') - exon_measurement_file.write( - '#{chr}\t{start}\t{end}\t{cov}\t{perc_10}\t{perc_15}\t{perc_20}\t{perc_30}\t{perc_50}\t{perc_100}\n'.format( - chr='chr', - start='start', - end='end', - cov='measurement_mean_coverage', - perc_10='measurement_percentage10', - perc_15='measurement_percentage15', - perc_20='measurement_percentage20', - perc_30='measurement_percentage30', - perc_50='measurement_percentage50', - perc_100='measurement_percentage100', - ) + db.session.add(sample) + db.session.commit() + + # Add sample id to exon measurement file + sample.exon_measurement_file = '{0}_{1}.txt'.format(sample.id, sample.exon_measurement_file) + db.session.add(sample) + db.session.commit() + # Store sample_id and close session before starting Sambamba + sample_id = sample.id + db.session.close() + + # Create temp_dir + if not temp_path: + temp_dir = tempfile.mkdtemp() + else: # assume path exist and user is responsible for this directory + temp_dir = temp_path + + # Run sambamba + p = Popen(shlex.split(sambamba_command), stdout=PIPE, encoding='utf-8') + exon_measurement_file_path = '{0}/{1}'.format(temp_dir, sample.exon_measurement_file) + with open(exon_measurement_file_path, "w") as exon_measurement_file: + for line in p.stdout: + if print_output: + print(line) + + # Header + if line.startswith('#'): + header = line.rstrip().split('\t') + measurement_mean_coverage_index = header.index('meanCoverage') + measurement_percentage10_index = header.index('percentage10') + measurement_percentage15_index = header.index('percentage15') + measurement_percentage20_index = header.index('percentage20') + measurement_percentage30_index = header.index('percentage30') + measurement_percentage50_index = header.index('percentage50') + measurement_percentage100_index = header.index('percentage100') + exon_measurement_file.write( + '#{chr}\t{start}\t{end}\t{cov}\t{perc_10}\t{perc_15}\t{perc_20}\t{perc_30}\t{perc_50}\t{perc_100}\n'.format( + chr='chr', + start='start', + end='end', + cov='measurement_mean_coverage', + perc_10='measurement_percentage10', + perc_15='measurement_percentage15', + perc_20='measurement_percentage20', + perc_30='measurement_percentage30', + perc_50='measurement_percentage50', + perc_100='measurement_percentage100', ) + ) - # Measurement - else: - data = line.rstrip().split('\t') - chr, start, end = data[:3] - exon_id = '{0}_{1}_{2}'.format(chr, start, end) - measurement_mean_coverage = float(data[measurement_mean_coverage_index]) - measurement_percentage10 = float(data[measurement_percentage10_index]) - measurement_percentage15 = float(data[measurement_percentage15_index]) - measurement_percentage20 = float(data[measurement_percentage20_index]) - measurement_percentage30 = float(data[measurement_percentage30_index]) - measurement_percentage50 = float(data[measurement_percentage50_index]) - measurement_percentage100 = float(data[measurement_percentage100_index]) - exon_measurement_file.write( - '{chr}\t{start}\t{end}\t{cov}\t{perc_10}\t{perc_15}\t{perc_20}\t{perc_30}\t{perc_50}\t{perc_100}\n'.format( - chr=chr, - start=start, - end=end, - cov=measurement_mean_coverage, - perc_10=measurement_percentage10, - perc_15=measurement_percentage15, - perc_20=measurement_percentage20, - perc_30=measurement_percentage30, - perc_50=measurement_percentage50, - perc_100=measurement_percentage100, - ) + # Measurement + else: + data = line.rstrip().split('\t') + chr, start, end = data[:3] + exon_id = '{0}_{1}_{2}'.format(chr, start, end) + measurement_mean_coverage = float(data[measurement_mean_coverage_index]) + measurement_percentage10 = float(data[measurement_percentage10_index]) + measurement_percentage15 = float(data[measurement_percentage15_index]) + measurement_percentage20 = float(data[measurement_percentage20_index]) + measurement_percentage30 = float(data[measurement_percentage30_index]) + measurement_percentage50 = float(data[measurement_percentage50_index]) + measurement_percentage100 = float(data[measurement_percentage100_index]) + exon_measurement_file.write( + '{chr}\t{start}\t{end}\t{cov}\t{perc_10}\t{perc_15}\t{perc_20}\t{perc_30}\t{perc_50}\t{perc_100}\n'.format( + chr=chr, + start=start, + end=end, + cov=measurement_mean_coverage, + perc_10=measurement_percentage10, + perc_15=measurement_percentage15, + perc_20=measurement_percentage20, + perc_30=measurement_percentage30, + perc_50=measurement_percentage50, + perc_100=measurement_percentage100, ) + ) - exon_measurements[exon_id] = { - 'sample_id': sample.id, - 'exon_id': exon_id, - 'measurement_mean_coverage': measurement_mean_coverage, - 'measurement_percentage10': measurement_percentage10, - 'measurement_percentage15': measurement_percentage15, - 'measurement_percentage20': measurement_percentage20, - 'measurement_percentage30': measurement_percentage30, - 'measurement_percentage50': measurement_percentage50, - 'measurement_percentage100': measurement_percentage100, - } - - # Set transcript measurements - transcripts = Transcript.query.options(joinedload('exons')).all() - transcripts_measurements = {} - - for transcript in transcripts: - for exon in transcript.exons: - exon_measurement = exon_measurements[exon.id] - if transcript.id not in transcripts_measurements: - transcripts_measurements[transcript.id] = { - 'len': exon.len, - 'transcript_id': transcript.id, - 'sample_id': sample.id, - 'measurement_mean_coverage': exon_measurement['measurement_mean_coverage'], - 'measurement_percentage10': exon_measurement['measurement_percentage10'], - 'measurement_percentage15': exon_measurement['measurement_percentage15'], - 'measurement_percentage20': exon_measurement['measurement_percentage20'], - 'measurement_percentage30': exon_measurement['measurement_percentage30'], - 'measurement_percentage50': exon_measurement['measurement_percentage50'], - 'measurement_percentage100': exon_measurement['measurement_percentage100'], - } - else: - measurement_types = ['measurement_mean_coverage', 'measurement_percentage10', 'measurement_percentage15', 'measurement_percentage20', 'measurement_percentage30', 'measurement_percentage50', 'measurement_percentage100'] - for measurement_type in measurement_types: - transcripts_measurements[transcript.id][measurement_type] = utils.weighted_average( - values=[transcripts_measurements[transcript.id][measurement_type], exon_measurement[measurement_type]], - weights=[transcripts_measurements[transcript.id]['len'], exon.len] - ) - transcripts_measurements[transcript.id]['len'] += exon.len - - # Bulk insert transcript measurements - bulk_insert_n = 1000 - transcript_values = list(transcripts_measurements.values()) - for i in range(0, len(transcript_values), bulk_insert_n): - db.session.bulk_insert_mappings(TranscriptMeasurement, transcript_values[i:i+bulk_insert_n]) - db.session.commit() - - # Compress, index and rsync exon_measurements - exon_measurement_file_path_gz = '{0}.gz'.format(exon_measurement_file_path) - pysam.tabix_compress(exon_measurement_file_path, exon_measurement_file_path_gz) - pysam.tabix_index(exon_measurement_file_path_gz, seq_col=0, start_col=1, end_col=2) + exon_measurements[exon_id] = { + 'sample_id': sample.id, + 'exon_id': exon_id, + 'measurement_mean_coverage': measurement_mean_coverage, + 'measurement_percentage10': measurement_percentage10, + 'measurement_percentage15': measurement_percentage15, + 'measurement_percentage20': measurement_percentage20, + 'measurement_percentage30': measurement_percentage30, + 'measurement_percentage50': measurement_percentage50, + 'measurement_percentage100': measurement_percentage100, + } - # External subprocess - command_result = subprocess_run( - f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", shell=True, stdout=PIPE - ) - # Check returncode and raise CalledProcessError if non-zero. - try: - command_result.check_returncode() - except CalledProcessError: - sys.exit(f'ERROR: Rsync unsuccesful, returncode {command_result.returncode}.') + # Set transcript measurements + transcripts = Transcript.query.options(joinedload('exons')).all() + transcripts_measurements = {} + + for transcript in transcripts: + for exon in transcript.exons: + exon_measurement = exon_measurements[exon.id] + if transcript.id not in transcripts_measurements: + transcripts_measurements[transcript.id] = { + 'len': exon.len, + 'transcript_id': transcript.id, + 'sample_id': sample.id, + 'measurement_mean_coverage': exon_measurement['measurement_mean_coverage'], + 'measurement_percentage10': exon_measurement['measurement_percentage10'], + 'measurement_percentage15': exon_measurement['measurement_percentage15'], + 'measurement_percentage20': exon_measurement['measurement_percentage20'], + 'measurement_percentage30': exon_measurement['measurement_percentage30'], + 'measurement_percentage50': exon_measurement['measurement_percentage50'], + 'measurement_percentage100': exon_measurement['measurement_percentage100'], + } + else: + measurement_types = ['measurement_mean_coverage', 'measurement_percentage10', 'measurement_percentage15', 'measurement_percentage20', 'measurement_percentage30', 'measurement_percentage50', 'measurement_percentage100'] + for measurement_type in measurement_types: + transcripts_measurements[transcript.id][measurement_type] = utils.weighted_average( + values=[transcripts_measurements[transcript.id][measurement_type], exon_measurement[measurement_type]], + weights=[transcripts_measurements[transcript.id]['len'], exon.len] + ) + transcripts_measurements[transcript.id]['len'] += exon.len - # Change exon_measurement_file to path on server. - sample = Sample.query.get(sample_id) - sample.exon_measurement_file = '{0}/{1}.gz'.format( - app.config['EXON_MEASUREMENTS_RSYNC_PATH'].split(':')[-1], - sample.exon_measurement_file - ) - db.session.add(sample) + # Bulk insert transcript measurements + bulk_insert_n = 1000 + transcript_values = list(transcripts_measurements.values()) + for i in range(0, len(transcript_values), bulk_insert_n): + db.session.bulk_insert_mappings(TranscriptMeasurement, transcript_values[i:i+bulk_insert_n]) db.session.commit() - # Remove temp_dir - if not temp_path: - shutil.rmtree(temp_dir) + # Compress, index and rsync exon_measurements + exon_measurement_file_path_gz = '{0}.gz'.format(exon_measurement_file_path) + pysam.tabix_compress(exon_measurement_file_path, exon_measurement_file_path_gz) + pysam.tabix_index(exon_measurement_file_path_gz, seq_col=0, start_col=1, end_col=2) - # Return sample id - if print_sample_id: - print(sample.id) - - -class SearchSample(Command): - """Search sample in database.""" - - option_list = ( - Option('sample_name'), + # External subprocess + command_result = subprocess_run( + f"rsync {exon_measurement_file_path_gz}* {app.config['EXON_MEASUREMENTS_RSYNC_PATH']}", shell=True, stdout=PIPE ) - - def run(self, sample_name): - samples = Sample.query.filter(Sample.name.like('%{0}%'.format(sample_name))).all() - - print("Sample ID\tSample Name\tProject\tSequencing Runs\tCustom Panels") - for sample in samples: - print("{id}\t{name}\t{project}\t{runs}\t{custom_panels}".format( - id=sample.id, - name=sample.name, - project=sample.project, - runs=sample.sequencing_runs, - custom_panels=sample.custom_panels, - )) - - -class SampleQC(Command): - """Perform sample QC for a given panel (15X)""" - - option_list = ( - Option('-s', '--samples', nargs='+'), - Option('-p', '--panels', nargs='+'), - Option('-a', '--archived_panels', dest='active_panels', default=True, - action='store_false', help="Use archived panels instead of active panels"), + # Check returncode and raise CalledProcessError if non-zero. + try: + command_result.check_returncode() + except CalledProcessError: + sys.exit(f'ERROR: Rsync unsuccesful, returncode {command_result.returncode}.') + + # Change exon_measurement_file to path on server. + sample = Sample.query.get(sample_id) + sample.exon_measurement_file = '{0}/{1}.gz'.format( + app.config['EXON_MEASUREMENTS_RSYNC_PATH'].split(':')[-1], + sample.exon_measurement_file ) + db.session.add(sample) + db.session.commit() + + # Remove temp_dir + if not temp_path: + shutil.rmtree(temp_dir) + + # Return sample id + if print_sample_id: + print(sample.id) + + +@app.cli.command('search_sample') +@click.argument('sample_name') +def search_sample(sample_name): + samples = Sample.query.filter(Sample.name.like('%{0}%'.format(sample_name))).all() + + print("Sample ID\tSample Name\tProject\tSequencing Runs\tCustom Panels") + for sample in samples: + print("{id}\t{name}\t{project}\t{runs}\t{custom_panels}".format( + id=sample.id, + name=sample.name, + project=sample.project, + runs=sample.sequencing_runs, + custom_panels=sample.custom_panels, + )) - def run(self, samples, panels, active_panels): - # Check equal number of samples and panels - if len(samples) != len(panels): - sys.exit('Number of samples and number of panels must be exactly the same.') - - # Header - print("Sample\tPanel\tPanel min. 15x\tPanel 15x\tPanel QC passed\tCore Gene QC passed") - - for index, sample_id in enumerate(samples): - # Query database - sample = Sample.query.get(sample_id) - panel = ( - PanelVersion.query - .filter_by(panel_name=panels[index]) - .filter_by(active=active_panels) - .filter_by(validated=True) - .order_by(PanelVersion.id.desc()) - .first() - ) - - if sample and panel: - transcript_measurements = ( - db.session.query(Transcript, TranscriptMeasurement) - .join(panels_transcripts) - .filter(panels_transcripts.columns.panel_id == panel.id) - .join(TranscriptMeasurement) - .filter_by(sample_id=sample.id) - .options(joinedload(Transcript.exons, innerjoin=True)) - .options(joinedload(Transcript.gene)) - .order_by(TranscriptMeasurement.measurement_percentage15.asc()) - .all() - ) - # Calculate average panel 15X coverage and compare with coverage_requirement_15 - panel_qc = False - panel_measurement_percentage15_avg = utils.weighted_average( - values=[tm[1].measurement_percentage15 for tm in transcript_measurements], - weights=[tm[1].len for tm in transcript_measurements] - ) - if panel_measurement_percentage15_avg >= panel.coverage_requirement_15: - panel_qc = True - - # Check gene 15X coverage for core genes - core_gene_qc = True - for transcript, transcript_measurement in transcript_measurements: - if transcript.gene in panel.core_genes and transcript_measurement.measurement_percentage15 != 100: - core_gene_qc = False - - self.print_result( - sample=sample.name, - panel=panel, - panel_min_15x=panel.coverage_requirement_15, - panel_15x=panel_measurement_percentage15_avg, - panel_qc=panel_qc, - core_gene_qc=core_gene_qc - ) - else: # sample and/or panel not found - sample_msg = 'unknown_sample={0}'.format(sample_id) - panel_msg = 'unknown_panel={0}'.format(panels[index]) - - if sample: - sample_msg = sample.name - if panel: - panel_msg = panel - - self.print_result( - sample=sample_msg, - panel=panel_msg, - ) +@app.cli.command('sample_qc') +@click.option('-s', '--samples', multiple=True) +@click.option('-p', '--panels', multiple=True) +@click.option('-a', '--archived_panels', 'active_panels', default=True, is_flag=True) +def sample_qc(samples, panels, active_panels): + """Perform sample QC for a given panel (15X)""" + # Check equal number of samples and panels + if len(samples) != len(panels): + sys.exit('Number of samples and number of panels must be exactly the same.') - def print_result(self, sample, panel, panel_min_15x='', panel_15x='', panel_qc='', core_gene_qc=''): + def print_result(sample, panel, panel_min_15x='', panel_15x='', panel_qc='', core_gene_qc=''): print("{sample}\t{panel}\t{panel_min_15x}\t{panel_15x}\t{panel_qc}\t{core_gene_qc}".format( sample=sample, panel=panel, @@ -446,446 +359,320 @@ def print_result(self, sample, panel, panel_min_15x='', panel_15x='', panel_qc=' core_gene_qc=core_gene_qc )) + # Header + print("Sample\tPanel\tPanel min. 15x\tPanel 15x\tPanel QC passed\tCore Gene QC passed") -class RemoveSample(Command): - """Remove sample from database.""" - - option_list = ( - Option('sample_id'), - ) - - def run(self, sample_id): + for index, sample_id in enumerate(samples): + # Query database sample = Sample.query.get(sample_id) - if not sample: - sys.exit("ERROR: Sample not found in the database.") - - elif sample.custom_panels: - sys.exit("ERROR: Sample is used in custom panels.") + panel = ( + PanelVersion.query + .filter_by(panel_name=panels[index]) + .filter_by(active=active_panels) + .filter_by(validated=True) + .order_by(PanelVersion.id.desc()) + .first() + ) - else: - db.session.delete(sample) - db.session.commit() + if sample and panel: + transcript_measurements = ( + db.session.query(Transcript, TranscriptMeasurement) + .join(panels_transcripts) + .filter(panels_transcripts.columns.panel_id == panel.id) + .join(TranscriptMeasurement) + .filter_by(sample_id=sample.id) + .options(joinedload(Transcript.exons, innerjoin=True)) + .options(joinedload(Transcript.gene)) + .order_by(TranscriptMeasurement.measurement_percentage15.asc()) + .all() + ) + # Calculate average panel 15X coverage and compare with coverage_requirement_15 + panel_qc = False + panel_measurement_percentage15_avg = utils.weighted_average( + values=[tm[1].measurement_percentage15 for tm in transcript_measurements], + weights=[tm[1].len for tm in transcript_measurements] + ) + if panel_measurement_percentage15_avg >= panel.coverage_requirement_15: + panel_qc = True + + # Check gene 15X coverage for core genes + core_gene_qc = True + for transcript, transcript_measurement in transcript_measurements: + if transcript.gene in panel.core_genes and transcript_measurement.measurement_percentage15 != 100: + core_gene_qc = False + + print_result( + sample=sample.name, + panel=panel, + panel_min_15x=panel.coverage_requirement_15, + panel_15x=panel_measurement_percentage15_avg, + panel_qc=panel_qc, + core_gene_qc=core_gene_qc + ) + else: # sample and/or panel not found + sample_msg = 'unknown_sample={0}'.format(sample_id) + panel_msg = 'unknown_panel={0}'.format(panels[index]) + + if sample: + sample_msg = sample.name + if panel: + panel_msg = panel + + print_result( + sample=sample_msg, + panel=panel_msg, + ) -class CheckSamples(Command): - """Check transcripts_measurements for all samples.""" - def run(self): - error = False +@app.cli.command('remove_sample') +@click.argument('sample_id') +def remove_sample(sample_id): + """Remove sample from database.""" + sample = Sample.query.get(sample_id) + if not sample: + sys.exit("ERROR: Sample not found in the database.") - transcript_count = Transcript.query.count() - sample_transcript_count = db.session.query(Sample.name, func.count(TranscriptMeasurement.id)).outerjoin(TranscriptMeasurement).group_by(Sample.id).all() - for sample_count in sample_transcript_count: - if sample_count[1] != transcript_count: - print("ERROR: Sample:{0} TranscriptMeasurement:{1} Exons:{2}".format(sample_count[0], sample_count[1], transcript_count)) - error = True + elif sample.custom_panels: + sys.exit("ERROR: Sample is used in custom panels.") - if not error: - print("No errors found.") + else: + db.session.delete(sample) + db.session.commit() -class CreateSampleSet(Command): +@app.cli.command('check_samples') +def check_samples(): + """Check transcripts_measurements for all samples.""" + error = False + + transcript_count = Transcript.query.count() + sample_transcript_count = db.session.query(Sample.name, func.count(TranscriptMeasurement.id)).outerjoin(TranscriptMeasurement).group_by(Sample.id).all() + for sample_count in sample_transcript_count: + if sample_count[1] != transcript_count: + print("ERROR: Sample:{0} TranscriptMeasurement:{1} Exons:{2}".format(sample_count[0], sample_count[1], transcript_count)) + error = True + + if not error: + print("No errors found.") + + +@app.cli.command('create_sample_set') +@click.argument('name') +@click.option('-m', '--min_days', type=int, default=0) +@click.option('-d', '--max_days', type=int, default=180) +@click.option('-s', '--sample_filter', default='') +@click.option('-t', '--sample_type', default='WES') +@click.option('-n', '--sample_number', type=int, default=100) +def create_sample_set(name, min_days, max_days, sample_filter, sample_type, sample_number): """Create (random) sample set.""" - - option_list = ( - Option('name'), - Option('-m', '--min_days', dest='min_days', type=int, default=0), - Option('-d', '--max_days', dest='max_days', type=int, default=180), - Option('-s', '--sample_filter', dest='sample_filter', default=''), - Option('-t', '--sample_type', dest='sample_type', default='WES'), - Option('-n', '--sample_number', dest='sample_number', type=int, default=100), + description = '{0} random {1} samples. Minimum age: {2} days. Maximum age: {3} days. Sample name filter: {4}'.format( + sample_number, sample_type, min_days, max_days, sample_filter ) - - def run(self, name, min_days, max_days, sample_filter, sample_type, sample_number): - description = '{0} random {1} samples. Minimum age: {2} days. Maximum age: {3} days. Sample name filter: {4}'.format( - sample_number, sample_type, min_days, max_days, sample_filter - ) - min_date = datetime.date.today() - datetime.timedelta(days=min_days) - max_date = datetime.date.today() - datetime.timedelta(days=max_days) - - if max_date > min_date: - raise ValueError( - ( - "Incorect use of max_days ({max_days}) and/or min_days ({min_days}): " - "maximum date {max_date} is greater than min date {min_date}" - ).format(max_days=max_days, min_days=min_days, max_date=max_date, min_date=min_date) - ) - samples = ( - Sample.query - .filter(Sample.name.like('%{0}%'.format(sample_filter))) - .filter_by(type=sample_type) - .order_by(func.rand()) + min_date = datetime.date.today() - datetime.timedelta(days=min_days) + max_date = datetime.date.today() - datetime.timedelta(days=max_days) + + if max_date > min_date: + raise ValueError( + ( + "Incorect use of max_days ({max_days}) and/or min_days ({min_days}): " + "maximum date {max_date} is greater than min date {min_date}" + ).format(max_days=max_days, min_days=min_days, max_date=max_date, min_date=min_date) ) - sample_count = 0 - - sample_set = SampleSet( - name=name, - description=description, - ) - - for sample in samples: - # Filter sampels: import date, 'special' project type (validation etc), Merge samples, - if ( - sample.import_date > max_date - and sample.import_date <= min_date - and not sample.project.type - and len(sample.sequencing_runs) == 1 - and sample.sequencing_runs[0].platform_unit in sample.project.name - ): - sample_set.samples.append(sample) - sample_count += 1 - - if sample_count >= sample_number: - break - - if len(sample_set.samples) != sample_number: - print("Not enough samples found to create sample set, found {0} samples.".format(len(sample_set.samples))) - else: - print("Creating new random sample set:") - print("\tName: {0}".format(sample_set.name)) - print("\tDescription: {0}".format(sample_set.description)) - print("\tSamples:") - for sample in sample_set.samples: - print("\t\t{0}\t{1}\t{2}\t{3}\t{4}".format( - sample.name, - sample.type, - sample.project, - sample.sequencing_runs, - sample.import_date - )) - - confirmation = '' - while confirmation not in ['y', 'n']: - confirmation = raw_input('Please check samples and press [Y/y] to continue or [N/n] to abort. ').lower() - - if confirmation == 'y': - db.session.add(sample_set) - db.session.commit() - print("Sample set created, make sure to activate it via the admin page.") - - -class ImportAliasTable(Command): - """Import gene aliases from HGNC (ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt)""" - - def run(self): - hgnc_file = urllib.urlopen('ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt') - header = hgnc_file.readline().strip().split('\t') - for line in hgnc_file: - data = line.strip().split('\t') - - # skip lines without locus_group, refseq_accession, gene symbol or alias symbol - try: - locus_group = data[header.index('locus_group')] - refseq_accession = data[header.index('refseq_accession')] - hgnc_gene_symbol = data[header.index('symbol')] # Current hgnc gene symbol - hgnc_prev_symbols = data[header.index('prev_symbol')].strip('"') # Use only previous gene symbols as aliases - except IndexError: - continue - - # Only process protein-coding gene or non-coding RNA with 'NM' refseq_accession - if (locus_group == 'protein-coding gene' or locus_group == 'non-coding RNA') and 'NM' in refseq_accession and hgnc_prev_symbols: - # Possible gene id's - hgnc_gene_ids = [hgnc_gene_symbol] - hgnc_gene_ids.extend(hgnc_prev_symbols.split('|')) - - # Find gene in database - db_genes_ids = [] - for hgnc_gene_id in hgnc_gene_ids: - db_gene = Gene.query.get(hgnc_gene_id) - if db_gene: - db_genes_ids.append(db_gene.id) - - # Check db genes - if not db_genes_ids: - print("ERROR: No gene in database found for: {0}".format(','.join(hgnc_gene_ids))) - - # Create aliases - else: - for db_gene_id in db_genes_ids: - for hgnc_gene_id in hgnc_gene_ids: - if hgnc_gene_id not in db_genes_ids: - try: - gene_alias = GeneAlias(id=hgnc_gene_id, gene_id=db_gene_id) - db.session.add(gene_alias) - db.session.commit() - except IntegrityError: - db.session.rollback() - continue - elif hgnc_gene_id != db_gene_id: # but does exist as gene in database - print("ERROR: Can not import alias: {0} for gene: {1}".format(hgnc_gene_id, db_gene_id)) - - -class ExportAliasTable(Command): - """Print tab delimited HGNC alias / gene ID table""" - - def run(self): - print('hgnc_gene_id_alias\tgene_id') - gene_aliases = GeneAlias.query.order_by(GeneAlias.id).all() - for gene in gene_aliases: - print('{alias}\t{gene}'.format(alias=gene.id, gene=gene.gene_id)) - - -class PrintPanelBed(Command): - """Print bed file containing regions in validated active or validated archived panels. FULL_autosomal and FULL_TARGET are filtered from the list.""" - - option_list = ( - Option('-f', '--remove_flank', dest='remove_flank', default=False, action='store_true', help="Remove 20bp flank from exon coordinates."), - Option('-p', '--panel', dest='panel', help="Filter on panel name (including version, for example AMY01v19.1)."), - Option('-a', '--archived_panels', dest='active_panels', default=True, - action='store_false', help="Use archived panels instead of active panels"), + samples = ( + Sample.query + .filter(Sample.name.like('%{0}%'.format(sample_filter))) + .filter_by(type=sample_type) + .order_by(func.rand()) ) + sample_count = 0 - def run(self, remove_flank, panel, active_panels): - exons = [] - - if panel: - panel_name, version = panel.split('v') - version_year, version_revision = version.split('.') - panel_versions = ( - PanelVersion.query - .filter_by(panel_name=panel_name) - .filter_by(version_year=version_year) - .filter_by(version_revision=version_revision) - .options(joinedload('transcripts')) - ) - else: - panel_versions = ( - PanelVersion.query - .filter_by(active=active_panels) - .filter_by(validated=True) - .options(joinedload('transcripts')) - ) + sample_set = SampleSet( + name=name, + description=description, + ) - for panel in panel_versions: - if 'FULL' not in panel.panel_name: # skip FULL_autosomal and FULL_TARGET - for transcript in panel.transcripts: - for exon in transcript.exons: - if exon.id not in exons: - exons.append(exon.id) - if remove_flank: - print("{chr}\t{start}\t{end}\t{gene}".format( - chr=exon.chr, - start=exon.start + 20, # Add 20bp to remove flank - end=exon.end - 20, # Substract 20bp to remove flank - gene=transcript.gene.id - )) - else: - print("{chr}\t{start}\t{end}\t{gene}".format( - chr=exon.chr, - start=exon.start, - end=exon.end, - gene=transcript.gene.id - )) - - -class LoadDesign(Command): - """Load design files to database.""" - - def run(self): - # files - exon_file = app.config['EXON_BED_FILE'] - gene_transcript_file = app.config['GENE_TRANSCRIPT_FILE'] - preferred_transcripts_file = app.config['PREFERRED_TRANSCRIPTS_FILE'] - gene_panel_file = app.config['GENE_PANEL_FILE'] - - # Filled during parsing. - transcripts = {} - exons = [] - - # Parse Exon file - with open(exon_file, 'r') as f: - print("Loading exon file: {0}".format(exon_file)) - for line in f: - data = line.rstrip().split('\t') - chr, start, end = data[:3] - # Create exon - exon = Exon( - id='{0}_{1}_{2}'.format(chr, start, end), - chr=chr, - start=int(start), - end=int(end) - ) + for sample in samples: + # Filter sampels: import date, 'special' project type (validation etc), Merge samples, + if ( + sample.import_date > max_date + and sample.import_date <= min_date + and not sample.project.type + and len(sample.sequencing_runs) == 1 + and sample.sequencing_runs[0].platform_unit in sample.project.name + ): + sample_set.samples.append(sample) + sample_count += 1 + + if sample_count >= sample_number: + break + + if len(sample_set.samples) != sample_number: + print("Not enough samples found to create sample set, found {0} samples.".format(len(sample_set.samples))) + else: + print("Creating new random sample set:") + print("\tName: {0}".format(sample_set.name)) + print("\tDescription: {0}".format(sample_set.description)) + print("\tSamples:") + for sample in sample_set.samples: + print("\t\t{0}\t{1}\t{2}\t{3}\t{4}".format( + sample.name, + sample.type, + sample.project, + sample.sequencing_runs, + sample.import_date + )) - try: - transcript_data = set(data[6].split(':')) - except IndexError: - print("Warning: No transcripts for exon: {0}.".format(exon)) - transcript_data = [] + confirmation = '' + while confirmation not in ['y', 'n']: + confirmation = input('Please check samples and press [Y/y] to continue or [N/n] to abort. ').lower() - # Create transcripts - for transcript_name in transcript_data: - if transcript_name != 'NA': - if transcript_name in transcripts: - transcript = transcripts[transcript_name] + if confirmation == 'y': + db.session.add(sample_set) + db.session.commit() + print("Sample set created, make sure to activate it via the admin page.") - # Set start / end positions - if transcript.start > exon.start: - transcript.start = exon.start - if transcript.end < exon.end: - transcript.end = exon.end - # Sanity check chromosome - if transcript.chr != exon.chr: - print("Warning: Different chromosomes for {0} and {1}".format(transcript, exon)) +@db_cli.command('import_alias_table') +def import_alias_table(): + """Import gene aliases from HGNC (ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt)""" + hgnc_file = urllib.request.urlopen('ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt') + header = hgnc_file.readline().decode("utf-8").strip().split('\t') + for line in hgnc_file: + data = line.decode("utf-8").strip().split('\t') + # skip lines without locus_group, refseq_accession, gene symbol or alias symbol + try: + locus_group = data[header.index('locus_group')] + refseq_accession = data[header.index('refseq_accession')] + hgnc_gene_symbol = data[header.index('symbol')] # Current hgnc gene symbol + hgnc_prev_symbols = data[header.index('prev_symbol')].strip('"') # Use only previous gene symbols as aliases + except IndexError: + continue + + # Only process protein-coding gene or non-coding RNA with 'NM' refseq_accession + if (locus_group == 'protein-coding gene' or locus_group == 'non-coding RNA') and 'NM' in refseq_accession and hgnc_prev_symbols: + # Possible gene id's + hgnc_gene_ids = [hgnc_gene_symbol] + hgnc_gene_ids.extend(hgnc_prev_symbols.split('|')) + + # Find gene in database + db_genes_ids = [] + for hgnc_gene_id in hgnc_gene_ids: + db_gene = Gene.query.get(hgnc_gene_id) + if db_gene: + db_genes_ids.append(db_gene.id) + + # Check db genes + if not db_genes_ids: + print("ERROR: No gene in database found for: {0}".format(','.join(hgnc_gene_ids))) + + # Create aliases + else: + for db_gene_id in db_genes_ids: + for hgnc_gene_id in hgnc_gene_ids: + if hgnc_gene_id not in db_genes_ids: + try: + gene_alias = GeneAlias(id=hgnc_gene_id, gene_id=db_gene_id) + db.session.add(gene_alias) + db.session.commit() + except IntegrityError: + db.session.rollback() + continue + elif hgnc_gene_id != db_gene_id: # but does exist as gene in database + print("ERROR: Can not import alias: {0} for gene: {1}".format(hgnc_gene_id, db_gene_id)) + + +@db_cli.command('export_alias_table') +def export_alias_table(): + """Print tab delimited HGNC alias / gene ID table""" + print('hgnc_gene_id_alias\tgene_id') + gene_aliases = GeneAlias.query.order_by(GeneAlias.id).all() + for gene in gene_aliases: + print('{alias}\t{gene}'.format(alias=gene.id, gene=gene.gene_id)) + + +# class PrintPanelBed(Command): +# + +# option_list = ( +# Option('-f', '--remove_flank', dest='remove_flank', default=False, action='store_true', help="Remove 20bp flank from exon coordinates."), +# Option('-p', '--panel', dest='panel', help="Filter on panel name (including version, for example AMY01v19.1)."), +# Option('-a', '--archived_panels', dest='active_panels', default=True, +# action='store_false', help="Use archived panels instead of active panels"), +# ) +@db_cli.command('export_panel_bed') +@click.option('-f', '--remove_flank', default=False, is_flag=True, help="Remove 20bp flank from exon coordinates.") +@click.option('-p', '--panel', help="Filter on panel name (including version, for example AMY01v19.1).") +@click.option( + '-a', '--archived_panels', 'active_panels', default=True, is_flag=True, + help="Use archived panels instead of active panels" +) +def print_panel_bed(remove_flank, panel, active_panels): + """Print bed file containing regions in validated active or validated archived panels. + FULL_autosomal and FULL_TARGET are filtered from the list. + """ + exons = [] + + if panel: + panel_name, version = panel.split('v') + version_year, version_revision = version.split('.') + panel_versions = ( + PanelVersion.query + .filter_by(panel_name=panel_name) + .filter_by(version_year=version_year) + .filter_by(version_revision=version_revision) + .options(joinedload(PanelVersion.transcripts)) + ) + else: + panel_versions = ( + PanelVersion.query + .filter_by(active=active_panels) + .filter_by(validated=True) + .options(joinedload(PanelVersion.transcripts)) + ) + for panel in panel_versions: + if 'FULL' not in panel.panel_name: # skip FULL_autosomal and FULL_TARGET + for transcript in panel.transcripts: + for exon in transcript.exons: + if exon.id not in exons: + exons.append(exon.id) + if remove_flank: + print("{chr}\t{start}\t{end}\t{gene}".format( + chr=exon.chr, + start=exon.start + 20, # Add 20bp to remove flank + end=exon.end - 20, # Substract 20bp to remove flank + gene=transcript.gene.id + )) else: - transcript = Transcript( - name=transcript_name, + print("{chr}\t{start}\t{end}\t{gene}".format( chr=exon.chr, start=exon.start, - end=exon.end - ) - transcripts[transcript_name] = transcript - exon.transcripts.append(transcript) - exons.append(exon) - - # Bulk insert exons and transcript - bulk_insert_n = 5000 - for i in range(0, len(exons), bulk_insert_n): - db.session.add_all(exons[i:i+bulk_insert_n]) - db.session.commit() - - db.session.add_all(list(transcripts.values())) - db.session.commit() - - # Load gene and transcript file - genes = {} - with open(gene_transcript_file, 'r') as f: - print("Loading gene transcript file: {0}".format(gene_transcript_file)) - for line in f: - if not line.startswith('#'): - data = line.rstrip().split('\t') - gene_id = data[0].rstrip() - transcript_name = data[1].rstrip() - - if transcript_name in transcripts: - if gene_id in genes: - gene = genes[gene_id] - else: - gene = Gene(id=gene_id) - genes[gene_id] = gene - gene.transcripts.append(transcripts[transcript_name]) - db.session.add(gene) - else: - print("Warning: Unkown transcript {0} for gene {1}".format(transcript_name, gene_id)) - db.session.commit() - - # Setup preferred transcript dictonary - preferred_transcripts = {} # key = gene, value = transcript - with open(preferred_transcripts_file, 'r') as f: - print("Loading preferred transcripts file: {0}".format(preferred_transcripts_file)) - for line in f: - if not line.startswith('#'): - # Parse data - data = line.rstrip().split('\t') - gene = genes[data[0]] - transcript = transcripts[data[1]] - - # Set default transcript - gene.default_transcript = transcript - preferred_transcripts[gene.id] = transcript.name - db.session.add(gene) - db.session.commit() - - # Setup gene panel - with open(gene_panel_file, 'r') as f: - print("Loading gene panel file: {0}".format(gene_panel_file)) - panels = {} - for line in f: - data = line.rstrip().split('\t') - panel = data[0] - - if 'elid' not in panel: # Skip old elid panel designs - panels[panel] = data[2] - - for panel in sorted(panels.keys()): - panel_match = re.search('(\w+)v(1\d).(\d)', panel) # look for [panel_name]v[version] pattern - genes = panels[panel].split(',') - if panel_match: - panel_name = panel_match.group(1) - panel_version_year = panel_match.group(2) - panel_version_revision = panel_match.group(3) - else: - panel_name = panel - panel_version_year = time.strftime('%y') - panel_version_revision = 1 - - panel = utils.get_one_or_create( - db.session, - Panel, - name=panel_name, - )[0] - - panel_version = PanelVersion(panel_name=panel_name, version_year=panel_version_year, version_revision=panel_version_revision, active=True, validated=True, user_id=1) - - for gene in set(genes): - if gene in preferred_transcripts: - transcript = transcripts[preferred_transcripts[gene]] - panel_version.transcripts.append(transcript) - else: - print("WARNING: Unkown gene: {0}".format(gene)) - db.session.add(panel_version) - db.session.commit() - - -class ExportCovStatsSampleSet(Command): + end=exon.end, + gene=transcript.gene.id + )) + + +@db_cli.command('coverage_stats') +@click.argument('sample_set_id') +@click.argument('data_type', type=click.Choice(['panel', 'transcript'])) +@click.option( + '-m', '--measurement_type', + type=click.Choice([ + 'measurement_mean_coverage', + 'measurement_percentage10', + 'measurement_percentage15', + 'measurement_percentage20', + 'measurement_percentage30', + 'measurement_percentage50', + 'measurement_percentage100' + ]), + default='measurement_percentage15' +) +@click.option('-a', '--archived_panels', 'active_panels', default=True, is_flag=True) +def export_cov_stats_sample_set(sample_set_id, data_type, measurement_type, active_panels): """Print tab delimited coverage statistics of panel or transcript as table.""" - - option_list = ( - Option('sample_set_id', type=str), - Option('data_type', type=str, choices=["panel", "transcript"]), - Option( - '-m', '--measurement_type', - choices=[ - 'measurement_mean_coverage', - 'measurement_percentage10', - 'measurement_percentage15', - 'measurement_percentage20', - 'measurement_percentage30', - 'measurement_percentage50', - 'measurement_percentage100' - ], - dest='measurement_type', - default='measurement_percentage15' - ), - Option('-a', '--archived_panels', dest='active_panels', default=True, - action='store_false', help="Use archived panels instead of active panels"), - ) - - def run(self, sample_set_id, data_type, measurement_type, active_panels): - # retrieve samples from sampleset - try: - sample_set = SampleSet.query.options(joinedload('samples')).filter_by(id=sample_set_id).one() - except NoResultFound as e: - print("Sample set ID {id} does not exist in database.".format(id=sample_set_id)) - sys.exit(e) - sample_ids = [sample.id for sample in sample_set.samples] - - # retrieve panels, transcripts measurements per sample - query = ( - db.session.query(PanelVersion, TranscriptMeasurement) - .filter_by(active=active_panels, validated=True) - .filter(PanelVersion.panel_name.notlike("%FULL%")) - .join(Transcript, PanelVersion.transcripts) - .join(TranscriptMeasurement) - .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) - .order_by( - PanelVersion.panel_name, - PanelVersion.id, - TranscriptMeasurement.transcript_id, - TranscriptMeasurement.sample_id) - .all() - ) - - if data_type == "panel": - self.retrieve_and_print_panel_measurements( - ss_samples=sample_set.samples, query=query, measurement_type=measurement_type - ) - elif data_type == "transcript": - self.retrieve_and_print_transcript_measurements(query=query, measurement_type=measurement_type) - - - def retrieve_and_print_panel_measurements(self, ss_samples, query, measurement_type): + def retrieve_and_print_panel_measurements(ss_samples, query, measurement_type): panels_measurements = OrderedDict() # retrieve panel measurements for panel, transcript_measurement in query: @@ -922,8 +709,7 @@ def retrieve_and_print_panel_measurements(self, ss_samples, query, measurement_t ) ) - - def retrieve_and_print_transcript_measurements(self, query, measurement_type): + def retrieve_and_print_transcript_measurements(query, measurement_type): panels_measurements = OrderedDict() # retrieve transcript measurements @@ -960,3 +746,184 @@ def retrieve_and_print_transcript_measurements(self, query, measurement_type): max=transcript_cov_stats[transcript]['max'] ) ) + + # retrieve samples from sampleset + try: + sample_set = SampleSet.query.options(joinedload(SampleSet.samples)).filter_by(id=sample_set_id).one() + except NoResultFound as e: + print("Sample set ID {id} does not exist in database.".format(id=sample_set_id)) + sys.exit(e) + sample_ids = [sample.id for sample in sample_set.samples] + + # retrieve panels, transcripts measurements per sample + query = ( + db.session.query(PanelVersion, TranscriptMeasurement) + .filter_by(active=active_panels, validated=True) + .filter(PanelVersion.panel_name.notlike("%FULL%")) + .join(Transcript, PanelVersion.transcripts) + .join(TranscriptMeasurement) + .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) + .order_by( + PanelVersion.panel_name, + PanelVersion.id, + TranscriptMeasurement.transcript_id, + TranscriptMeasurement.sample_id) + .all() + ) + + if data_type == "panel": + retrieve_and_print_panel_measurements( + ss_samples=sample_set.samples, query=query, measurement_type=measurement_type + ) + elif data_type == "transcript": + retrieve_and_print_transcript_measurements(query=query, measurement_type=measurement_type) + + +# @db_cli.command('load_design') +# def load_design(): +# """Load design files to database.""" +# # Disabled, can be used to setup a new database from scratch +# # files +# exon_file = app.config['EXON_BED_FILE'] +# gene_transcript_file = app.config['GENE_TRANSCRIPT_FILE'] +# preferred_transcripts_file = app.config['PREFERRED_TRANSCRIPTS_FILE'] +# gene_panel_file = app.config['GENE_PANEL_FILE'] + +# # Filled during parsing. +# transcripts = {} +# exons = [] + +# # Parse Exon file +# with open(exon_file, 'r') as f: +# print("Loading exon file: {0}".format(exon_file)) +# for line in f: +# data = line.rstrip().split('\t') +# chr, start, end = data[:3] +# # Create exon +# exon = Exon( +# id='{0}_{1}_{2}'.format(chr, start, end), +# chr=chr, +# start=int(start), +# end=int(end) +# ) + +# try: +# transcript_data = set(data[6].split(':')) +# except IndexError: +# print("Warning: No transcripts for exon: {0}.".format(exon)) +# transcript_data = [] + +# # Create transcripts +# for transcript_name in transcript_data: +# if transcript_name != 'NA': +# if transcript_name in transcripts: +# transcript = transcripts[transcript_name] + +# # Set start / end positions +# if transcript.start > exon.start: +# transcript.start = exon.start +# if transcript.end < exon.end: +# transcript.end = exon.end + +# # Sanity check chromosome +# if transcript.chr != exon.chr: +# print("Warning: Different chromosomes for {0} and {1}".format(transcript, exon)) + +# else: +# transcript = Transcript( +# name=transcript_name, +# chr=exon.chr, +# start=exon.start, +# end=exon.end +# ) +# transcripts[transcript_name] = transcript +# exon.transcripts.append(transcript) +# exons.append(exon) + +# # Bulk insert exons and transcript +# bulk_insert_n = 5000 +# for i in range(0, len(exons), bulk_insert_n): +# db.session.add_all(exons[i:i+bulk_insert_n]) +# db.session.commit() + +# db.session.add_all(list(transcripts.values())) +# db.session.commit() + +# # Load gene and transcript file +# genes = {} +# with open(gene_transcript_file, 'r') as f: +# print("Loading gene transcript file: {0}".format(gene_transcript_file)) +# for line in f: +# if not line.startswith('#'): +# data = line.rstrip().split('\t') +# gene_id = data[0].rstrip() +# transcript_name = data[1].rstrip() + +# if transcript_name in transcripts: +# if gene_id in genes: +# gene = genes[gene_id] +# else: +# gene = Gene(id=gene_id) +# genes[gene_id] = gene +# gene.transcripts.append(transcripts[transcript_name]) +# db.session.add(gene) +# else: +# print("Warning: Unkown transcript {0} for gene {1}".format(transcript_name, gene_id)) +# db.session.commit() + +# # Setup preferred transcript dictonary +# preferred_transcripts = {} # key = gene, value = transcript +# with open(preferred_transcripts_file, 'r') as f: +# print("Loading preferred transcripts file: {0}".format(preferred_transcripts_file)) +# for line in f: +# if not line.startswith('#'): +# # Parse data +# data = line.rstrip().split('\t') +# gene = genes[data[0]] +# transcript = transcripts[data[1]] + +# # Set default transcript +# gene.default_transcript = transcript +# preferred_transcripts[gene.id] = transcript.name +# db.session.add(gene) +# db.session.commit() + +# # Setup gene panel +# with open(gene_panel_file, 'r') as f: +# print("Loading gene panel file: {0}".format(gene_panel_file)) +# panels = {} +# for line in f: +# data = line.rstrip().split('\t') +# panel = data[0] + +# if 'elid' not in panel: # Skip old elid panel designs +# panels[panel] = data[2] + +# for panel in sorted(panels.keys()): +# panel_match = re.search('(\w+)v(1\d).(\d)', panel) # look for [panel_name]v[version] pattern +# genes = panels[panel].split(',') +# if panel_match: +# panel_name = panel_match.group(1) +# panel_version_year = panel_match.group(2) +# panel_version_revision = panel_match.group(3) +# else: +# panel_name = panel +# panel_version_year = time.strftime('%y') +# panel_version_revision = 1 + +# panel = utils.get_one_or_create( +# db.session, +# Panel, +# name=panel_name, +# )[0] + +# panel_version = PanelVersion(panel_name=panel_name, version_year=panel_version_year, version_revision=panel_version_revision, active=True, validated=True, user_id=1) + +# for gene in set(genes): +# if gene in preferred_transcripts: +# transcript = transcripts[preferred_transcripts[gene]] +# panel_version.transcripts.append(transcript) +# else: +# print("WARNING: Unkown gene: {0}".format(gene)) +# db.session.add(panel_version) +# db.session.commit() \ No newline at end of file diff --git a/README.md b/README.md index 67b599d..d4a21a7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # ExonCov + ExonCov: Exon coverage statistics from BAM files ### Requirements + - [Python 3](https://www.python.org/) - [Virtualenv](https://virtualenv.pypa.io/en/stable/) - [MYSQL](https://www.mysql.com/) @@ -9,18 +11,22 @@ ExonCov: Exon coverage statistics from BAM files - [Dx Tracks](https://github.com/UMCUGenetics/Dx_tracks) ### Setup + ```bash # Clone git repository git clone git@github.com:UMCUGenetics/ExonCov.git cd ExonCov # Setup python virtual environment and install python dependencies -python3 -m venv venv +python3.11 -m venv venv source venv/bin/activate pip install -r requirements.txt ``` + #### Edit config.py + Change the following lines in config.py + ```python SQLALCHEMY_DATABASE_URI = 'mysql://:@localhost/exoncov3' #or 'mysql+mysqlconnector://' SECRET_KEY = @@ -37,25 +43,30 @@ EXON_MEASUREMENTS_RSYNC_PATH = 'rsync/path/for/data' ``` ### Load design + ```bash source venv/bin/activate python ExonCov.py load_design ``` ### Import bam file + ```bash source venv/bin/activate -python ExonCov.py import_bam +flask --app ExonCov import_bam ``` ### Run development webserver + ```bash source venv/bin/activate -python ExonCov.py runserver -r -d +flask --app ExonCov run --debug ``` ### Export and Import existing ExonCov db + Ignore large tables, samples should be imported using cli. + ```bash mysqldump --user= --password --no-data --tab= exoncov @@ -77,6 +88,7 @@ mysql --init-command="SET SESSION FOREIGN_KEY_CHECKS=0;" --user=exoncov --passwo ``` Execute following mysql statements to import data from txt files. + ```mysql LOAD DATA LOCAL INFILE 'alembic_version.txt' INTO TABLE alembic_version; LOAD DATA LOCAL INFILE 'exons.txt' INTO TABLE exons; From 3d4507b025f69a184454b2796b1c32193f97e498 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 14 Mar 2024 14:13:01 +0100 Subject: [PATCH 09/13] Add gunicorn --- README.md | 7 +++++++ requirements.txt | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4a21a7..2c976d6 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,13 @@ source venv/bin/activate flask --app ExonCov run --debug ``` +### Run production webserver + +```bash +source venv/bin/activate +gunicorn -w 4 ExonCov:app +``` + ### Export and Import existing ExonCov db Ignore large tables, samples should be imported using cli. diff --git a/requirements.txt b/requirements.txt index 529b734..19147bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,10 +3,11 @@ Flask==2.2.5 Flask-Admin==1.6.1 Flask-DebugToolbar==0.14.1 Flask-Migrate==4.0.7 +Flask-Security-Too==4.1.6 Flask-SQLAlchemy==3.1.1 Flask-WTF==1.2.1 +gunicorn==21.2.0 mysql-connector-python==8.3.0 pysam==0.22.0 pytz==2024.1 -WTForms==2.3.3 -Flask-Security-Too==4.1.6 \ No newline at end of file +WTForms==2.3.3 \ No newline at end of file From 4a361efacdbbaacbb3ba1a405f6906b766ce305a Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 14 Mar 2024 14:43:31 +0100 Subject: [PATCH 10/13] Remove wsgi --- exoncov.wsgi | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100755 exoncov.wsgi diff --git a/exoncov.wsgi b/exoncov.wsgi deleted file mode 100755 index 8ce08c6..0000000 --- a/exoncov.wsgi +++ /dev/null @@ -1,14 +0,0 @@ -"""ExonCov wsgi.""" -activate_this = '/var/www/html/exoncov/venv/bin/activate_this.py' -execfile(activate_this, dict(__file__=activate_this)) - -# Add app to the path -import sys -sys.path.insert(0, "/var/www/html/exoncov") - -# Load app -from ExonCov import app as application -from ExonCov.utils import WSGIMiddleware - -# Setup prefix -application.wsgi_app = WSGIMiddleware(application.wsgi_app, "") From f0266d090a4a459a5779613f58486d8237394b35 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 14 Mar 2024 14:51:59 +0100 Subject: [PATCH 11/13] Update migration --- .../743f453ff85b_add_fs_uniquifier.py | 59 ------------------- 1 file changed, 59 deletions(-) diff --git a/migrations/versions/743f453ff85b_add_fs_uniquifier.py b/migrations/versions/743f453ff85b_add_fs_uniquifier.py index 4b36ac1..c321c00 100644 --- a/migrations/versions/743f453ff85b_add_fs_uniquifier.py +++ b/migrations/versions/743f453ff85b_add_fs_uniquifier.py @@ -7,7 +7,6 @@ """ from alembic import op import sqlalchemy as sa -from sqlalchemy.dialects import mysql # revision identifiers, used by Alembic. revision = '743f453ff85b' @@ -17,68 +16,10 @@ def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### - # with op.batch_alter_table('event_logs', schema=None) as batch_op: - # batch_op.create_index(batch_op.f('ix_event_logs_user_id'), ['user_id'], unique=False) - - # with op.batch_alter_table('exons', schema=None) as batch_op: - # batch_op.drop_index('ix_exons_chr') - - # with op.batch_alter_table('sample_projects', schema=None) as batch_op: - # batch_op.alter_column('name', - # existing_type=mysql.VARCHAR(length=255), - # type_=sa.String(length=50), - # existing_nullable=False) - # batch_op.alter_column('type', - # existing_type=mysql.VARCHAR(length=255), - # nullable=True) - - # with op.batch_alter_table('sequencing_runs', schema=None) as batch_op: - # batch_op.alter_column('name', - # existing_type=mysql.VARCHAR(length=255), - # type_=sa.String(length=50), - # existing_nullable=True) - - # with op.batch_alter_table('transcript_measurements', schema=None) as batch_op: - # batch_op.create_unique_constraint(None, ['transcript_id', 'sample_id']) - # batch_op.create_foreign_key(None, 'transcripts', ['transcript_id'], ['id']) - with op.batch_alter_table('user', schema=None) as batch_op: batch_op.add_column(sa.Column('fs_uniquifier', sa.String(length=255), nullable=False)) - # batch_op.create_unique_constraint(None, ['fs_uniquifier']) - - # ### end Alembic commands ### def downgrade(): - # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table('user', schema=None) as batch_op: - # batch_op.drop_constraint(None, type_='unique') batch_op.drop_column('fs_uniquifier') - - # with op.batch_alter_table('transcript_measurements', schema=None) as batch_op: - # batch_op.drop_constraint(None, type_='foreignkey') - # batch_op.drop_constraint(None, type_='unique') - - # with op.batch_alter_table('sequencing_runs', schema=None) as batch_op: - # batch_op.alter_column('name', - # existing_type=sa.String(length=50), - # type_=mysql.VARCHAR(length=255), - # existing_nullable=True) - - # with op.batch_alter_table('sample_projects', schema=None) as batch_op: - # batch_op.alter_column('type', - # existing_type=mysql.VARCHAR(length=255), - # nullable=False) - # batch_op.alter_column('name', - # existing_type=sa.String(length=50), - # type_=mysql.VARCHAR(length=255), - # existing_nullable=False) - - # with op.batch_alter_table('exons', schema=None) as batch_op: - # batch_op.create_index('ix_exons_chr', ['chr'], unique=False) - - # with op.batch_alter_table('event_logs', schema=None) as batch_op: - # batch_op.drop_index(batch_op.f('ix_event_logs_user_id')) - - # ### end Alembic commands ### From df66b95f09a406640cf26d3ef815fe467d3b05bf Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 15 Mar 2024 07:46:21 +0100 Subject: [PATCH 12/13] Add fs_uniquifier with uuid --- ExonCov/cli.py | 2 +- ExonCov/models.py | 3 ++- config.py | 2 +- migrations/versions/743f453ff85b_add_fs_uniquifier.py | 10 ++++++++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index c19fc5c..81a766a 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -255,7 +255,7 @@ def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite } # Set transcript measurements - transcripts = Transcript.query.options(joinedload('exons')).all() + transcripts = Transcript.query.options(joinedload(Transcript.exons)).all() transcripts_measurements = {} for transcript in transcripts: diff --git a/ExonCov/models.py b/ExonCov/models.py index 0bdee4b..eaa02b4 100644 --- a/ExonCov/models.py +++ b/ExonCov/models.py @@ -415,13 +415,14 @@ class User(db.Model, UserMixin): roles = db.relationship('Role', secondary=roles_users, lazy='joined', backref=db.backref('users')) - def __init__(self, email, password, first_name, last_name, active, roles): + def __init__(self, email, password, first_name, last_name, active, roles, fs_uniquifier): self.email = email self.password = password self.first_name = first_name self.last_name = last_name self.active = False self.roles = roles + self.fs_uniquifier = fs_uniquifier def __repr__(self): return "User({0}-{1})".format(self.id, self.email) diff --git a/config.py b/config.py index 5e362b2..11449a1 100644 --- a/config.py +++ b/config.py @@ -1,7 +1,7 @@ """ExonCov configuration.""" # SQLAlchemy -SQLALCHEMY_DATABASE_URI = 'mysql://exoncov3:exoncov3@localhost/exoncov3' +SQLALCHEMY_DATABASE_URI = 'mysql+mysqlconnector://exoncov3:exoncov3@localhost/exoncov3' SQLALCHEMY_ECHO = False SQLALCHEMY_TRACK_MODIFICATIONS = False diff --git a/migrations/versions/743f453ff85b_add_fs_uniquifier.py b/migrations/versions/743f453ff85b_add_fs_uniquifier.py index c321c00..aa2b3ff 100644 --- a/migrations/versions/743f453ff85b_add_fs_uniquifier.py +++ b/migrations/versions/743f453ff85b_add_fs_uniquifier.py @@ -7,6 +7,7 @@ """ from alembic import op import sqlalchemy as sa +import uuid # revision identifiers, used by Alembic. revision = '743f453ff85b' @@ -19,7 +20,16 @@ def upgrade(): with op.batch_alter_table('user', schema=None) as batch_op: batch_op.add_column(sa.Column('fs_uniquifier', sa.String(length=255), nullable=False)) + connection = op.get_bind() + for user in connection.execute(sa.text('SELECT id FROM user')): + fs_uniquifier = uuid.uuid4().hex + connection.execute(sa.text(f"UPDATE user SET fs_uniquifier = '{fs_uniquifier}' WHERE id = {user[0]}")) + + with op.batch_alter_table('user', schema=None) as batch_op: + batch_op.create_unique_constraint(None, ['fs_uniquifier']) + def downgrade(): with op.batch_alter_table('user', schema=None) as batch_op: + batch_op.drop_constraint('fs_uniquifier', type_='unique') batch_op.drop_column('fs_uniquifier') From 73888376485d9fb013cc11d9b23a24535b569e2f Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 25 Mar 2024 14:20:41 +0100 Subject: [PATCH 13/13] Review comments, mainly flake8 long lines --- ExonCov/cli.py | 101 +++++++++++++----------- ExonCov/forms.py | 16 +++- ExonCov/models.py | 8 +- ExonCov/utils.py | 6 +- ExonCov/views.py | 190 +++++++++++++++++++++++++++++++++++++++------- 5 files changed, 237 insertions(+), 84 deletions(-) diff --git a/ExonCov/cli.py b/ExonCov/cli.py index 81a766a..7419e4d 100644 --- a/ExonCov/cli.py +++ b/ExonCov/cli.py @@ -1,8 +1,6 @@ """CLI functions.""" from collections import OrderedDict import sys -import re -import time from subprocess import run as subprocess_run, PIPE, Popen, CalledProcessError import shlex import urllib.request @@ -55,12 +53,12 @@ def print_stats(): @db_cli.command('panel_genes') -@click.option('--archived_panels', '-a', default=True, is_flag=True) -def print_panel_genes_table(archived_panels): +@click.option('-a', '--archived_panels', 'active_panels', default=True, is_flag=True) +def print_panel_genes_table(active_panels): """Print tab delimited panel / genes table.""" - print('{panel}\t{release_date}\t{genes}'.format(panel='panel_version', release_date='release_date', genes='genes')) + print('panel_version\trelease_date\tgenes') - panel_versions = PanelVersion.query.filter_by(active=archived_panels).options(joinedload(PanelVersion.transcripts)) + panel_versions = PanelVersion.query.filter_by(active=active_panels).options(joinedload(PanelVersion.transcripts)) for panel in panel_versions: print('{panel}\t{release_date}\t{genes}'.format( @@ -71,7 +69,7 @@ def print_panel_genes_table(archived_panels): @db_cli.command('gene_transcripts') -@click.option('--preferred_transcripts', '-p', is_flag=True) +@click.option('-p', '--preferred_transcripts', is_flag=True) def print_transcripts(preferred_transcripts): """Print tab delimited gene / transcript table""" print('{gene}\t{transcript}'.format(gene='Gene', transcript='Transcript')) @@ -120,7 +118,7 @@ def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite db.session, SequencingRun, platform_unit=read_group['PU'] - ) # returns object and exists bool + ) # Returns object and exists bool sequencing_runs[read_group['PU']] = sequencing_run sequencing_run_ids.append(sequencing_run.id) @@ -135,7 +133,7 @@ def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite SampleProject, name=project_name, type='' - ) # returns object and exists bool + ) # Returns object and exists bool # Look for sample in database sample = Sample.query.filter_by(name=sample_name).filter_by(project_id=sample_project.id).first() @@ -146,13 +144,19 @@ def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite sys.exit("ERROR: Sample and project combination already exists.\t{0}".format(sample)) # Create sambamba command - sambamba_command = "{sambamba} depth region {bam_file} --nthreads {threads} --filter '{filter}' --regions {bed_file} {settings}".format( + sambamba_command = ( + "{sambamba} depth region {bam_file} --nthreads {threads} --filter '{filter}' --regions {bed_file} {settings}" + ).format( sambamba=app.config['SAMBAMBA'], bam_file=bam, threads=threads, filter=app.config['SAMBAMBA_FILTER'], bed_file=exon_bed_file, - settings='--fix-mate-overlaps --min-base-quality 10 --cov-threshold 10 --cov-threshold 15 --cov-threshold 20 --cov-threshold 30 --cov-threshold 50 --cov-threshold 100', + settings=( + '--fix-mate-overlaps --min-base-quality 10 ' + '--cov-threshold 10 --cov-threshold 15 --cov-threshold 20 ' + '--cov-threshold 30 --cov-threshold 50 --cov-threshold 100' + ) ) # Create sample @@ -179,7 +183,7 @@ def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite # Create temp_dir if not temp_path: temp_dir = tempfile.mkdtemp() - else: # assume path exist and user is responsible for this directory + else: # Assume path exist and user is responsible for this directory temp_dir = temp_path # Run sambamba @@ -275,7 +279,11 @@ def import_bam(project_name, sample_type, bam, exon_bed_file, threads, overwrite 'measurement_percentage100': exon_measurement['measurement_percentage100'], } else: - measurement_types = ['measurement_mean_coverage', 'measurement_percentage10', 'measurement_percentage15', 'measurement_percentage20', 'measurement_percentage30', 'measurement_percentage50', 'measurement_percentage100'] + measurement_types = [ + 'measurement_mean_coverage', + 'measurement_percentage10', 'measurement_percentage15', 'measurement_percentage20', + 'measurement_percentage30', 'measurement_percentage50', 'measurement_percentage100' + ] for measurement_type in measurement_types: transcripts_measurements[transcript.id][measurement_type] = utils.weighted_average( values=[transcripts_measurements[transcript.id][measurement_type], exon_measurement[measurement_type]], @@ -410,7 +418,7 @@ def print_result(sample, panel, panel_min_15x='', panel_15x='', panel_qc='', cor panel_qc=panel_qc, core_gene_qc=core_gene_qc ) - else: # sample and/or panel not found + else: # Sample and/or panel not found sample_msg = 'unknown_sample={0}'.format(sample_id) panel_msg = 'unknown_panel={0}'.format(panels[index]) @@ -447,10 +455,16 @@ def check_samples(): error = False transcript_count = Transcript.query.count() - sample_transcript_count = db.session.query(Sample.name, func.count(TranscriptMeasurement.id)).outerjoin(TranscriptMeasurement).group_by(Sample.id).all() + sample_transcript_count = ( + db.session.query(Sample.name, func.count(TranscriptMeasurement.id)) + .outerjoin(TranscriptMeasurement) + .group_by(Sample.id).all() + ) for sample_count in sample_transcript_count: if sample_count[1] != transcript_count: - print("ERROR: Sample:{0} TranscriptMeasurement:{1} Exons:{2}".format(sample_count[0], sample_count[1], transcript_count)) + print("ERROR: Sample:{0} TranscriptMeasurement:{1} Exons:{2}".format( + sample_count[0], sample_count[1], transcript_count + )) error = True if not error: @@ -540,7 +554,7 @@ def import_alias_table(): header = hgnc_file.readline().decode("utf-8").strip().split('\t') for line in hgnc_file: data = line.decode("utf-8").strip().split('\t') - # skip lines without locus_group, refseq_accession, gene symbol or alias symbol + # Skip lines without locus_group, refseq_accession, gene symbol or alias symbol try: locus_group = data[header.index('locus_group')] refseq_accession = data[header.index('refseq_accession')] @@ -550,7 +564,11 @@ def import_alias_table(): continue # Only process protein-coding gene or non-coding RNA with 'NM' refseq_accession - if (locus_group == 'protein-coding gene' or locus_group == 'non-coding RNA') and 'NM' in refseq_accession and hgnc_prev_symbols: + if ( + (locus_group == 'protein-coding gene' or locus_group == 'non-coding RNA') + and 'NM' in refseq_accession + and hgnc_prev_symbols + ): # Possible gene id's hgnc_gene_ids = [hgnc_gene_symbol] hgnc_gene_ids.extend(hgnc_prev_symbols.split('|')) @@ -578,7 +596,7 @@ def import_alias_table(): except IntegrityError: db.session.rollback() continue - elif hgnc_gene_id != db_gene_id: # but does exist as gene in database + elif hgnc_gene_id != db_gene_id: # Does exist as gene in database print("ERROR: Can not import alias: {0} for gene: {1}".format(hgnc_gene_id, db_gene_id)) @@ -591,17 +609,8 @@ def export_alias_table(): print('{alias}\t{gene}'.format(alias=gene.id, gene=gene.gene_id)) -# class PrintPanelBed(Command): -# - -# option_list = ( -# Option('-f', '--remove_flank', dest='remove_flank', default=False, action='store_true', help="Remove 20bp flank from exon coordinates."), -# Option('-p', '--panel', dest='panel', help="Filter on panel name (including version, for example AMY01v19.1)."), -# Option('-a', '--archived_panels', dest='active_panels', default=True, -# action='store_false', help="Use archived panels instead of active panels"), -# ) @db_cli.command('export_panel_bed') -@click.option('-f', '--remove_flank', default=False, is_flag=True, help="Remove 20bp flank from exon coordinates.") +@click.option('-f', '--remove_flank', is_flag=True, help="Remove 20bp flank from exon coordinates.") @click.option('-p', '--panel', help="Filter on panel name (including version, for example AMY01v19.1).") @click.option( '-a', '--archived_panels', 'active_panels', default=True, is_flag=True, @@ -632,7 +641,7 @@ def print_panel_bed(remove_flank, panel, active_panels): ) for panel in panel_versions: - if 'FULL' not in panel.panel_name: # skip FULL_autosomal and FULL_TARGET + if 'FULL' not in panel.panel_name: # Skip FULL_autosomal and FULL_TARGET for transcript in panel.transcripts: for exon in transcript.exons: if exon.id not in exons: @@ -674,13 +683,13 @@ def export_cov_stats_sample_set(sample_set_id, data_type, measurement_type, acti """Print tab delimited coverage statistics of panel or transcript as table.""" def retrieve_and_print_panel_measurements(ss_samples, query, measurement_type): panels_measurements = OrderedDict() - # retrieve panel measurements + # Retrieve panel measurements for panel, transcript_measurement in query: sample = transcript_measurement.sample - # add new panel + # Add new panel if panel not in panels_measurements: panels_measurements[panel] = OrderedDict() - # add new sample or calculate weighted avg + # Add new sample or calculate weighted avg if sample not in panels_measurements[panel]: panels_measurements[panel][sample] = { 'len': transcript_measurement.len, @@ -693,7 +702,7 @@ def retrieve_and_print_panel_measurements(ss_samples, query, measurement_type): ) panels_measurements[panel][sample]['len'] += transcript_measurement.len - # print header + # Print header print("panel_version\tmeasurement_type\tmean\tmin\tmax") for panel in panels_measurements: panel_cov_stats = utils.get_summary_stats_multi_sample( @@ -712,29 +721,29 @@ def retrieve_and_print_panel_measurements(ss_samples, query, measurement_type): def retrieve_and_print_transcript_measurements(query, measurement_type): panels_measurements = OrderedDict() - # retrieve transcript measurements + # Retrieve transcript measurements for panel, transcript_measurement in query: sample = transcript_measurement.sample transcript = transcript_measurement.transcript - # add new panel + # Add new panel if panel not in panels_measurements: panels_measurements[panel] = OrderedDict() - # add new transcript + # Add new transcript if transcript not in panels_measurements[panel]: panels_measurements[panel][transcript] = {} - # add new sample + # Add new sample if sample not in panels_measurements[panel][transcript]: panels_measurements[panel][transcript][sample] = transcript_measurement[measurement_type] - # print header + # Print header print("panel_version\ttranscript_id\tgene_id\tmeasurement_type\tmean\tmin\tmax") for panel in panels_measurements: for transcript in panels_measurements[panel]: transcript_cov_stats = utils.get_summary_stats_multi_sample( measurements=panels_measurements[panel], keys=[transcript] ) - # print summary + # Print summary print( "{panel_version}\t{transcript}\t{gene}\t{measurement_type}\t{mean}\t{min}\t{max}".format( panel_version=panel, @@ -747,7 +756,7 @@ def retrieve_and_print_transcript_measurements(query, measurement_type): ) ) - # retrieve samples from sampleset + # Retrieve samples from sampleset try: sample_set = SampleSet.query.options(joinedload(SampleSet.samples)).filter_by(id=sample_set_id).one() except NoResultFound as e: @@ -755,7 +764,7 @@ def retrieve_and_print_transcript_measurements(query, measurement_type): sys.exit(e) sample_ids = [sample.id for sample in sample_set.samples] - # retrieve panels, transcripts measurements per sample + # Retrieve panels, transcripts measurements per sample query = ( db.session.query(PanelVersion, TranscriptMeasurement) .filter_by(active=active_panels, validated=True) @@ -778,11 +787,13 @@ def retrieve_and_print_transcript_measurements(query, measurement_type): elif data_type == "transcript": retrieve_and_print_transcript_measurements(query=query, measurement_type=measurement_type) - +# Disabled, can be used to setup a new database from scratch +# Needs imports: +# import re +# import time # @db_cli.command('load_design') # def load_design(): # """Load design files to database.""" -# # Disabled, can be used to setup a new database from scratch # # files # exon_file = app.config['EXON_BED_FILE'] # gene_transcript_file = app.config['GENE_TRANSCRIPT_FILE'] @@ -926,4 +937,4 @@ def retrieve_and_print_transcript_measurements(query, measurement_type): # else: # print("WARNING: Unkown gene: {0}".format(gene)) # db.session.add(panel_version) -# db.session.commit() \ No newline at end of file +# db.session.commit() diff --git a/ExonCov/forms.py b/ExonCov/forms.py index 904c562..f245337 100644 --- a/ExonCov/forms.py +++ b/ExonCov/forms.py @@ -216,7 +216,9 @@ class CreatePanelForm(FlaskForm): description="List of genes seperated by newline, space, tab, ',' or ';'.", validators=[validators.InputRequired()] ) - core_gene_list = TextAreaField('Core gene list', description="List of core genes seperated by newline, space, tab, ',' or ';'.") + core_gene_list = TextAreaField( + 'Core gene list', description="List of core genes seperated by newline, space, tab, ',' or ';'." + ) coverage_requirement_15 = FloatField('Minimal % 15x', default=99, validators=[validators.InputRequired()]) disease_description_eng = StringField('Disease description', validators=[validators.InputRequired()]) disease_description_nl = StringField('Ziekteomschrijving', validators=[validators.InputRequired()]) @@ -256,7 +258,9 @@ def validate(self, extra_validators=[]): if self.name.data: panel = Panel.query.filter_by(name=self.name.data).first() if panel: - self.name.errors.append('Panel already exists, use the update button on the panel page to create a new version.') + self.name.errors.append( + 'Panel already exists, use the update button on the panel page to create a new version.' + ) if self.gene_list.errors or self.core_gene_list.errors or self.name.errors: return False @@ -272,7 +276,9 @@ class PanelNewVersionForm(FlaskForm): description="List of genes seperated by newline, space, tab, ',' or ';'.", validators=[validators.InputRequired()] ) - core_gene_list = TextAreaField('Core gene list', description="List of core genes seperated by newline, space, tab, ',' or ';'.") + core_gene_list = TextAreaField( + 'Core gene list', description="List of core genes seperated by newline, space, tab, ',' or ';'." + ) coverage_requirement_15 = FloatField('Minimal % 15x') comments = TextAreaField('Comments', description="Provide a short description.", validators=[validators.InputRequired()]) confirm = BooleanField('Confirm') @@ -329,7 +335,9 @@ class PanelVersionEditForm(FlaskForm): active = BooleanField('Active') validated = BooleanField('Validated') coverage_requirement_15 = FloatField('Minimal % 15x') - core_gene_list = TextAreaField('Core gene list', description="List of core genes seperated by newline, space, tab, ',' or ';'.") + core_gene_list = TextAreaField( + 'Core gene list', description="List of core genes seperated by newline, space, tab, ',' or ';'." + ) core_genes = [] diff --git a/ExonCov/models.py b/ExonCov/models.py index eaa02b4..0d78153 100644 --- a/ExonCov/models.py +++ b/ExonCov/models.py @@ -9,7 +9,7 @@ from . import db -# association tables +# Association tables exons_transcripts = db.Table( 'exons_transcripts', db.Column('exon_id', db.ForeignKey('exons.id'), primary_key=True), @@ -118,7 +118,7 @@ class Gene(db.Model): __tablename__ = 'genes' - id = db.Column(db.String(50, collation='utf8_bin'), primary_key=True) # hgnc + id = db.Column(db.String(50, collation='utf8_bin'), primary_key=True) # HGNC default_transcript_id = db.Column( db.Integer(), db.ForeignKey('transcripts.id', name='default_transcript_foreign_key'), @@ -139,7 +139,7 @@ class GeneAlias(db.Model): __tablename__ = 'gene_aliases' - id = db.Column(db.String(50, collation='utf8_bin'), primary_key=True) # hgnc + id = db.Column(db.String(50, collation='utf8_bin'), primary_key=True) # HGNC gene_id = db.Column(db.String(50, collation='utf8_bin'), db.ForeignKey('genes.id'), primary_key=True) gene = db.relationship('Gene', backref='aliases', foreign_keys=[gene_id]) @@ -230,7 +230,7 @@ class CustomPanel(db.Model): id = db.Column(db.Integer(), primary_key=True) date = db.Column(db.Date(), default=datetime.date.today, nullable=False, index=True) - research_number = db.Column(db.String(255), server_default='') # onderzoeksnummer @ lab + research_number = db.Column(db.String(255), server_default='') # Onderzoeksnummer @ lab comments = db.Column(db.Text()) user_id = db.Column(db.Integer(), db.ForeignKey('user.id'), nullable=False, index=True) validated = db.Column(db.Boolean, index=True, default=False) diff --git a/ExonCov/utils.py b/ExonCov/utils.py index 705f0dc..dcc7633 100644 --- a/ExonCov/utils.py +++ b/ExonCov/utils.py @@ -71,7 +71,7 @@ def event_logger(connection, log_model, model_name, action, event_data): def get_summary_stats(values): - return(min(values), max(values), float(sum(values)) / len(values)) + return (min(values), max(values), float(sum(values)) / len(values)) def get_summary_stats_multi_sample(measurements, keys=None, samples=None): @@ -82,12 +82,12 @@ def get_summary_stats_multi_sample(measurements, keys=None, samples=None): for key in keys: if samples and 'samples' in measurements[key]: - # when measurements scope is panel + # When measurements scope is panel values = [measurements[key]['samples'][sample]['measurement'] for sample in samples] elif samples: values = [measurements[key][sample]['measurement'] for sample in samples] else: - # when measurements scope is transcript or exon + # When measurements scope is transcript or exon values = measurements[key].values() measurements[key]['min'], measurements[key]['max'], measurements[key]['mean'] = get_summary_stats(values) return measurements diff --git a/ExonCov/views.py b/ExonCov/views.py index ae652a1..96a1fe0 100644 --- a/ExonCov/views.py +++ b/ExonCov/views.py @@ -14,7 +14,7 @@ from . import app, db from .models import ( Sample, SampleProject, SampleSet, SequencingRun, PanelVersion, Panel, CustomPanel, Gene, Transcript, - TranscriptMeasurement, panels_transcripts, Exon + TranscriptMeasurement, panels_transcripts ) from .forms import ( MeasurementTypeForm, CustomPanelForm, CustomPanelNewForm, CustomPanelValidateForm, SampleForm, @@ -57,7 +57,16 @@ def samples(): if project: samples = samples.join(SampleProject).filter(SampleProject.name.like('%{0}%'.format(project))) if run: - samples = samples.join(SequencingRun, Sample.sequencing_runs).filter(or_(SequencingRun.name.like('%{0}%'.format(run)), SequencingRun.platform_unit.like('%{0}%'.format(run)))) + samples = ( + samples + .join(SequencingRun, Sample.sequencing_runs) + .filter( + or_( + SequencingRun.name.like('%{0}%'.format(run)), + SequencingRun.platform_unit.like('%{0}%'.format(run)) + ) + ) + ) samples = samples.paginate(page=page, per_page=samples_per_page) else: samples = samples.paginate(page=page, per_page=samples_per_page) @@ -202,10 +211,16 @@ def sample_panel(sample_id, panel_id): weights=[tm[1].len for tm in transcript_measurements] ), 'core_genes': ', '.join( - ['{}({}) = {:.2f}%'.format(tm[0].gene, tm[0], tm[1].measurement_percentage15) for tm in transcript_measurements if tm[0].gene in panel.core_genes and tm[1].measurement_percentage15 < 100] + [ + '{}({}) = {:.2f}%'.format(tm[0].gene, tm[0], tm[1].measurement_percentage15) + for tm in transcript_measurements if tm[0].gene in panel.core_genes and tm[1].measurement_percentage15 < 100 + ] ), 'genes_15': ', '.join( - ['{}({}) = {:.2f}%'.format(tm[0].gene, tm[0], tm[1].measurement_percentage15) for tm in transcript_measurements if tm[0].gene not in panel.core_genes and tm[1].measurement_percentage15 < 95] + [ + '{}({}) = {:.2f}%'.format(tm[0].gene, tm[0], tm[1].measurement_percentage15) + for tm in transcript_measurements if tm[0].gene not in panel.core_genes and tm[1].measurement_percentage15 < 95 + ] ) } @@ -250,7 +265,13 @@ def sample_transcript(sample_id, transcript_name): 'measurement_percentage30': '>30', 'measurement_percentage100': '>100', } - return render_template('sample_transcript.html', sample=sample, transcript=transcript, exon_measurements=exon_measurements, measurement_types=measurement_types) + return render_template( + 'sample_transcript.html', + sample=sample, + transcript=transcript, + exon_measurements=exon_measurements, + measurement_types=measurement_types + ) @app.route('/sample//gene/') @@ -267,9 +288,22 @@ def sample_gene(sample_id, gene_id): 'measurement_percentage30': '>30', 'measurement_percentage100': '>100', } - transcript_measurements = db.session.query(Transcript, TranscriptMeasurement).filter(Transcript.gene_id == gene.id).join(TranscriptMeasurement).filter_by(sample_id=sample.id).options(joinedload(Transcript.exons, innerjoin=True)).all() + transcript_measurements = ( + db.session.query(Transcript, TranscriptMeasurement) + .filter(Transcript.gene_id == gene.id) + .join(TranscriptMeasurement) + .filter_by(sample_id=sample.id) + .options(joinedload(Transcript.exons, innerjoin=True)) + .all() + ) - return render_template('sample_gene.html', sample=sample, gene=gene, transcript_measurements=transcript_measurements, measurement_types=measurement_types) + return render_template( + 'sample_gene.html', + sample=sample, + gene=gene, + transcript_measurements=transcript_measurements, + measurement_types=measurement_types + ) @app.route('/panel') @@ -284,7 +318,12 @@ def panels(): @login_required def panel(name): """Panel page.""" - panel = Panel.query.filter_by(name=name).options(joinedload(Panel.versions).joinedload(PanelVersion.transcripts)).first_or_404() + panel = ( + Panel.query + .filter_by(name=name) + .options(joinedload(Panel.versions).joinedload(PanelVersion.transcripts)) + .first_or_404() + ) return render_template('panel.html', panel=panel) @@ -293,7 +332,12 @@ def panel(name): @roles_required('panel_admin') def panel_new_version(name): """Create new panel version page.""" - panel = Panel.query.filter_by(name=name).options(joinedload(Panel.versions).joinedload(PanelVersion.transcripts)).first_or_404() + panel = ( + Panel.query + .filter_by(name=name) + .options(joinedload(Panel.versions).joinedload(PanelVersion.transcripts)) + .first_or_404() + ) panel_last_version = panel.last_version genes = '\n'.join([transcript.gene_id for transcript in panel_last_version.transcripts]) @@ -527,7 +571,10 @@ def custom_panel(id): sample_ids = [sample.id for sample in custom_panel.samples] transcript_ids = [transcript.id for transcript in custom_panel.transcripts] - measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] + measurement_type = [ + measurement_type_form.data['measurement_type'], + dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type']) + ] transcript_measurements = {} panel_measurements = {} sample_stats = {sample: [[], {}] for sample in custom_panel.samples} @@ -552,7 +599,9 @@ def custom_panel(id): if transcript_measurement[measurement_type[0]] == 100: sample_stats[sample][0].append('{0}({1})'.format(transcript.gene.id, transcript.name)) else: - sample_stats[sample][1]['{0}({1})'.format(transcript.gene.id, transcript.name)] = transcript_measurement[measurement_type[0]] + sample_stats[sample][1]['{0}({1})'.format( + transcript.gene.id, transcript.name + )] = transcript_measurement[measurement_type[0]] # Calculate weighted average per sample for entire panel if sample not in panel_measurements: @@ -572,7 +621,15 @@ def custom_panel(id): values = [panel_measurements[sample]['measurement'] for sample in panel_measurements] panel_measurements['min'], panel_measurements['max'], panel_measurements['mean'] = get_summary_stats(values) - return render_template('custom_panel.html', form=measurement_type_form, custom_panel=custom_panel, measurement_type=measurement_type, transcript_measurements=transcript_measurements, panel_measurements=panel_measurements, sample_stats=sample_stats) + return render_template( + 'custom_panel.html', + form=measurement_type_form, + custom_panel=custom_panel, + measurement_type=measurement_type, + transcript_measurements=transcript_measurements, + panel_measurements=panel_measurements, + sample_stats=sample_stats + ) @app.route('/panel/custom//transcript/', methods=['GET', 'POST']) @@ -590,13 +647,21 @@ def custom_panel_transcript(id, transcript_name): .options(joinedload(Transcript.exons)) .first() ) - measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] + measurement_type = [ + measurement_type_form.data['measurement_type'], + dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type']) + ] transcript_measurements = {} exon_measurements = {} if sample_ids and measurement_type: # Get transcript measurements - query = TranscriptMeasurement.query.filter(TranscriptMeasurement.sample_id.in_(sample_ids)).filter_by(transcript_id=transcript.id).all() + query = ( + TranscriptMeasurement.query + .filter(TranscriptMeasurement.sample_id.in_(sample_ids)) + .filter_by(transcript_id=transcript.id) + .all() + ) for transcript_measurement in query: sample = transcript_measurement.sample transcript_measurements[sample] = transcript_measurement[measurement_type[0]] @@ -629,7 +694,15 @@ def custom_panel_transcript(id, transcript_name): exon_measurements = get_summary_stats_multi_sample(measurements=exon_measurements) - return render_template('custom_panel_transcript.html', form=measurement_type_form, transcript=transcript, custom_panel=custom_panel, measurement_type=measurement_type, transcript_measurements=transcript_measurements, exon_measurements=exon_measurements) + return render_template( + 'custom_panel_transcript.html', + form=measurement_type_form, + transcript=transcript, + custom_panel=custom_panel, + measurement_type=measurement_type, + transcript_measurements=transcript_measurements, + exon_measurements=exon_measurements + ) @app.route('/panel/custom//gene/', methods=['GET', 'POST']) @@ -642,7 +715,10 @@ def custom_panel_gene(id, gene_id): measurement_type_form = MeasurementTypeForm() sample_ids = [sample.id for sample in custom_panel.samples] - measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] + measurement_type = [ + measurement_type_form.data['measurement_type'], + dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type']) + ] transcript_measurements = {} if sample_ids and measurement_type: @@ -665,7 +741,14 @@ def custom_panel_gene(id, gene_id): transcript_measurements = get_summary_stats_multi_sample(measurements=transcript_measurements) - return render_template('custom_panel_gene.html', form=measurement_type_form, gene=gene, custom_panel=custom_panel, measurement_type=measurement_type, transcript_measurements=transcript_measurements) + return render_template( + 'custom_panel_gene.html', + form=measurement_type_form, + gene=gene, + custom_panel=custom_panel, + measurement_type=measurement_type, + transcript_measurements=transcript_measurements + ) @app.route('/panel/custom//set_validated', methods=['GET', 'POST']) @@ -686,7 +769,11 @@ def custom_panel_validated(id): return redirect(url_for('custom_panel', id=custom_panel.id)) else: - return render_template('custom_panel_validate_confirm.html', form=custom_panel_validate_form, custom_panel=custom_panel) + return render_template( + 'custom_panel_validate_confirm.html', + form=custom_panel_validate_form, + custom_panel=custom_panel + ) @app.route('/sample_set', methods=['GET', 'POST']) @@ -699,9 +786,17 @@ def sample_sets(): if panel_gene_form.validate_on_submit(): if panel_gene_form.panel.data: - return redirect(url_for('sample_set_panel', sample_set_id=panel_gene_form.sample_set.data.id, panel_id=panel_gene_form.panel.data.id)) + return redirect(url_for( + 'sample_set_panel', + sample_set_id=panel_gene_form.sample_set.data.id, + panel_id=panel_gene_form.panel.data.id + )) elif panel_gene_form.gene_id: - return redirect(url_for('sample_set_gene', sample_set_id=panel_gene_form.sample_set.data.id, gene_id=panel_gene_form.gene_id)) + return redirect(url_for( + 'sample_set_gene', + sample_set_id=panel_gene_form.sample_set.data.id, + gene_id=panel_gene_form.gene_id + )) return render_template('sample_sets.html', sample_sets=sample_sets, form=panel_gene_form) @@ -747,14 +842,23 @@ def sample_set(id): } else: panels_measurements[panel]['samples'][sample]['measurement'] = weighted_average( - values=[panels_measurements[panel]['samples'][sample]['measurement'], transcript_measurement[measurement_type[0]]], + values=[ + panels_measurements[panel]['samples'][sample]['measurement'], + transcript_measurement[measurement_type[0]] + ], weights=[panels_measurements[panel]['samples'][sample]['len'], transcript_measurement.len] ) panels_measurements[panel]['samples'][sample]['len'] += transcript_measurement.len panels_measurements = get_summary_stats_multi_sample(measurements=panels_measurements, samples=sample_set.samples) - return render_template('sample_set.html', sample_set=sample_set, form=measurement_type_form, measurement_type=measurement_type, panels_measurements=panels_measurements) + return render_template( + 'sample_set.html', + sample_set=sample_set, + form=measurement_type_form, + measurement_type=measurement_type, + panels_measurements=panels_measurements + ) @app.route('/sample_set//panel/', methods=['GET', 'POST']) @@ -767,7 +871,10 @@ def sample_set_panel(sample_set_id, panel_id): sample_ids = [sample.id for sample in sample_set.samples] transcript_ids = [transcript.id for transcript in panel.transcripts] - measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] + measurement_type = [ + measurement_type_form.data['measurement_type'], + dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type']) + ] transcript_measurements = {} query = ( @@ -790,7 +897,14 @@ def sample_set_panel(sample_set_id, panel_id): # Calculate min, mean, max transcript_measurements = get_summary_stats_multi_sample(measurements=transcript_measurements) - return render_template('sample_set_panel.html', sample_set=sample_set, form=measurement_type_form, measurement_type=measurement_type, panel=panel, transcript_measurements=transcript_measurements) + return render_template( + 'sample_set_panel.html', + sample_set=sample_set, + form=measurement_type_form, + measurement_type=measurement_type, + panel=panel, + transcript_measurements=transcript_measurements + ) @app.route('/sample_set//transcript/', methods=['GET', 'POST']) @@ -807,7 +921,10 @@ def sample_set_transcript(sample_set_id, transcript_name): ) measurement_type_form = MeasurementTypeForm() - measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] + measurement_type = [ + measurement_type_form.data['measurement_type'], + dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type']) + ] exon_measurements = {} # Get exon measurements @@ -833,7 +950,14 @@ def sample_set_transcript(sample_set_id, transcript_name): exon_measurements = get_summary_stats_multi_sample(measurements=exon_measurements) - return render_template('sample_set_transcript.html', form=measurement_type_form, transcript=transcript, sample_set=sample_set, measurement_type=measurement_type, exon_measurements=exon_measurements) + return render_template( + 'sample_set_transcript.html', + form=measurement_type_form, + transcript=transcript, + sample_set=sample_set, + measurement_type=measurement_type, + exon_measurements=exon_measurements + ) @app.route('/sample_set//gene/', methods=['GET', 'POST']) @@ -845,7 +969,10 @@ def sample_set_gene(sample_set_id, gene_id): measurement_type_form = MeasurementTypeForm() sample_ids = [sample.id for sample in sample_set.samples] - measurement_type = [measurement_type_form.data['measurement_type'], dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type'])] + measurement_type = [ + measurement_type_form.data['measurement_type'], + dict(measurement_type_form.measurement_type.choices).get(measurement_type_form.data['measurement_type']) + ] transcript_measurements = {} if sample_ids and measurement_type: @@ -868,4 +995,11 @@ def sample_set_gene(sample_set_id, gene_id): transcript_measurements = get_summary_stats_multi_sample(measurements=transcript_measurements) - return render_template('sample_set_gene.html', form=measurement_type_form, gene=gene, sample_set=sample_set, measurement_type=measurement_type, transcript_measurements=transcript_measurements) + return render_template( + 'sample_set_gene.html', + form=measurement_type_form, + ene=gene, + sample_set=sample_set, + measurement_type=measurement_type, + transcript_measurements=transcript_measurements + )