From 0cf1f6287f59eb9adc69b9c50b6fee241b843361 Mon Sep 17 00:00:00 2001
From: James Kent
Date: Mon, 4 Nov 2024 16:43:46 -0600
Subject: [PATCH] wip: add feature tables

---
 store/neurostore/models/data.py    | 51 +++++++++++++++++++++++++++++-
 store/neurostore/tests/conftest.py | 31 ++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/store/neurostore/models/data.py b/store/neurostore/models/data.py
index a3866d21..41d37118 100644
--- a/store/neurostore/models/data.py
+++ b/store/neurostore/models/data.py
@@ -278,7 +278,7 @@ class Study(BaseMixin, db.Model):
     public = db.Column(db.Boolean, default=True)
     level = db.Column(db.String)
     metadata_ = db.Column(JSONB)
-    source = db.Column(db.String, index=True)
+    source = db.Column(db.String, index=True)
     source_id = db.Column(db.String, index=True)
     source_updated_at = db.Column(db.DateTime(timezone=True))
     base_study_id = db.Column(db.Text, db.ForeignKey("base_studies.id"), index=True)
@@ -538,6 +538,55 @@ class PointValue(BaseMixin, db.Model):
     user = relationship("User", backref=backref("point_values", passive_deletes=True))
 
 
+class Pipeline(BaseMixin, db.Model):
+    """A feature-extraction pipeline (e.g., a demographics extractor)."""
+
+    __tablename__ = "pipelines"
+
+    name = db.Column(db.String)
+    description = db.Column(db.String)
+    version = db.Column(db.String)
+    # NOTE: fixed typo from "study_depenedent" (column is new in this patch)
+    study_dependent = db.Column(db.Boolean, default=False)
+    ace_compatible = db.Column(db.Boolean, default=False)
+    pubget_compatible = db.Column(db.Boolean, default=False)
+    derived_from = db.Column(db.Text)
+
+
+class PipelineConfig(BaseMixin, db.Model):
+    """A concrete configuration of a Pipeline, hashed for deduplication."""
+
+    __tablename__ = "pipeline_configs"
+
+    pipeline_id = db.Column(
+        db.Text, db.ForeignKey("pipelines.id", ondelete="CASCADE"), index=True
+    )
+    config = db.Column(JSONB)
+    config_hash = db.Column(db.String, index=True)
+    pipeline = relationship("Pipeline", backref=backref("configs", passive_deletes=True))
+
+
+class PipelineRun(BaseMixin, db.Model):
+    """One execution of a Pipeline with a specific PipelineConfig."""
+
+    __tablename__ = "pipeline_runs"
+
+    pipeline_id = db.Column(
+        db.Text, db.ForeignKey("pipelines.id", ondelete="CASCADE"), index=True
+    )
+    config_id = db.Column(
+        db.Text, db.ForeignKey("pipeline_configs.id", ondelete="CASCADE"), index=True
+    )
+    config = relationship("PipelineConfig", backref=backref("runs", passive_deletes=True))
+    run_index = db.Column(db.Integer())
+
+
+class PipelineRunResult(BaseMixin, db.Model):
+    """A single extracted feature value produced by a PipelineRun for a base study."""
+
+    __tablename__ = "pipeline_run_results"
+
+    run_id = db.Column(
+        db.Text, db.ForeignKey("pipeline_runs.id", ondelete="CASCADE"), index=True
+    )
+    base_study_id = db.Column(db.Text, db.ForeignKey("base_studies.id"), index=True)
+    feature_index = db.Column(db.Integer)  # the same categories of information can be extracted multiple times from a single paper (e.g., multiple demographic groups, multiple software packages, etc)
+    feature_group = db.Column(db.String)  # task, disease, software, age
+    feature = db.Column(db.String)  # stroop task, schizophrenia, fsl
+    value = db.Column(db.Float)  # 0.67, 0.3, 0.5 (some measure of confidence for the result)
+    run = relationship("PipelineRun", backref=backref("results", passive_deletes=True))
+
 # from . import event_listeners  # noqa E402
 # del event_listeners
 
diff --git a/store/neurostore/tests/conftest.py b/store/neurostore/tests/conftest.py
index 099ff129..7b896383 100644
--- a/store/neurostore/tests/conftest.py
+++ b/store/neurostore/tests/conftest.py
@@ -1,5 +1,7 @@
 import pytest
+import random
+import json
 from os import environ
-from neurostore.models.data import Analysis, Condition
+from neurostore.models.data import Analysis, Condition, Study
 from sqlalchemy.orm import scoped_session, sessionmaker
@@ -586,3 +588,30 @@ def simple_neurosynth_annotation(session, ingest_neurosynth):
     session.commit()
 
     return smol_annot
+
+
+@pytest.fixture(scope="function")
+def create_demographic_features(session, ingest_neurosynth, tmp_path):
+    """Write fake demographic-feature JSONL output for every ingested study."""
+    output_dir = tmp_path / "output" / "demographics" / "v1.0.0"
+    output_dir.mkdir(exist_ok=True, parents=True)
+    studies = Study.query.all()
+    diseases = ["schizophrenia", "bipolar disorder", "depression", "healthy"]
+    studies_data = [
+        [
+            {
+                "age": random.randint(18, 100),
+                "group": group
+            } for group in random.sample(diseases, k=random.randint(1, 2))
+        ] for study in studies
+    ]
+
+    for study, study_data in zip(studies, studies_data):
+        study_dir = output_dir / study.id
+        # BUGFIX: the per-study directory must exist before opening results.json
+        study_dir.mkdir(exist_ok=True, parents=True)
+        with open(study_dir / "results.json", "w") as f:
+            for entry in study_data:
+                json.dump(entry, f)
+                f.write('\n')
+
+    return output_dir