diff --git a/.dockerignore b/.dockerignore
new file mode 100755
index 00000000..60baa9cb
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+data/*
diff --git a/.gitignore b/.gitignore
index 3b791328..a2c1cd7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
+
 # some unused files
+rasters/*
+notebooks/.*
 notebooks/notebooks_plotting/*
 scripts/update_entries.py
 scripts/compare_tables.py
diff --git a/Dockerfile b/Dockerfile
old mode 100755
new mode 100644
index 71b48017..b5a37402
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,17 @@
-FROM datajoint/jupyter:python3.6
+FROM datajoint/djlab:py3.7-debian
+RUN pip install --upgrade pip
 RUN pip install --upgrade datajoint
 ADD . /src/IBL-pipeline
-
+USER root
 RUN pip install -e /src/IBL-pipeline
-RUN pip install globus_sdk
-RUN pip install plotly
-RUN pip install statsmodels
-RUN pip install scikits.bootstrap
-
-RUN pip install ibllib
-RUN pip install "git+https://github.com/ixcat/djwip.git#egg=djwip"
-
-ADD ./allen_structure_tree.csv /usr/local/lib/python3.6/dist-packages/ibllib/atlas
+RUN pip uninstall opencv-python -y
+RUN conda install -c conda-forge opencv -y
+COPY --chown=dja:anaconda ./apt_requirements.txt /tmp/apt_requirements.txt
+RUN apt update
+USER dja:anaconda
+RUN \
+    /entrypoint.sh echo "Requirements updated..." && \
+    rm "${APT_REQUIREMENTS}"
diff --git a/README.md b/README.md
index b3e5add2..5cb76d10 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,89 @@
-# Getting started with DataJoint for IBL #
-1. Email shanshen@vathes.com for a database username.
+# Identify your role
+
+It is important to identify how you will work with the IBL-pipeline. There are two typical roles:
+
+1. User:
+>* Internal user: an IBL user who would like to utilize the IBL-pipeline to query the IBL database for research and create downstream tables for their own analyses in the IBL database, but who will not contribute to the development of the main IBL pipeline.
+>* External user: similar to an internal user, except that an external user will not access data through the IBL database, but would like to adopt the database schemas and tables from the IBL pipeline.
+
+2. Developer: in addition to the activities of a user, a developer contributes to the daily ingestion, computation, and plotting of the IBL-pipeline.
+
+
+# Instructions for users
+
+1. Get credentials to the database server
+> For an IBL internal user, contact Shan Shen via shanshen@vathes.com or Slack for a username and an initial password to the IBL database. You can change your password with
+```
+import datajoint as dj
+dj.set_password()
+```
+> For an external user, set up your own database server; here are the [instructions](https://docs.datajoint.io/python/admin/1-hosting.html).
+
+2. Install the IBL-pipeline python package
+
+> Install the package with pip; this gives the latest released version. Use `pip3` instead if `pip` does not work properly.
+
+```
+pip install ibl-pipeline
+```
+> To upgrade to the latest version,
+```
+pip install --upgrade ibl-pipeline
+```
+> After the installation, `datajoint` and `ibl_pipeline` can be imported as regular modules.
+3. Set up the configuration of DataJoint.
+> Now that you have successfully installed the datajoint and ibl_pipeline packages, set up the configuration by specifying dj.config to properly connect to the database server.
+
+```
+shanshen@Shans-MacBook-Pro:~$ ipython
+In [1]: import datajoint as dj
+
+In [2]: dj.config
+Out[2]:
+{ 'connection.charset': '',
+ 'connection.init_function': None,
+ 'database.host': 'localhost',
+ 'database.password': None,
+ 'database.port': 3306,
+ 'database.reconnect': True,
+ 'database.user': None,
+ 'display.limit': 12,
+ 'display.show_tuple_count': True,
+ 'display.width': 14,
+ 'fetch_format': 'array',
+ 'loglevel': 'INFO',
+ 'safemode': True}
+```
+> The default values of dj.config are shown above. You will need to change the following fields:
+```
+dj.config['database.host'] = 'datajoint.internationalbrainlab.org'
+dj.config['database.user'] = 'YOUR_USERNAME'
+dj.config['database.password'] = 'YOUR_PASSWORD'
+```
+
+> Then save the configuration as a json file with either dj.config.save_local() or dj.config.save_global(). If saved globally, this configuration will be applied in all directories. If saved locally, it only applies when you work under the current directory. The configuration will be saved as a json file dj_local_conf.json in the current directory. You won't need to set up the configuration again the next time.
+
+> You can start using ibl_pipeline by importing modules, such as:
+
+```
+from ibl_pipeline import reference, subject, action, acquisition, data, behavior, ephys, histology
+```
+
+4. Special note: the IBL-pipeline is under active development, so tables of interest may already exist in the database before the latest version of ibl-pipeline is released. To get access to the latest tables, we also recommend using `dj.create_virtual_module`. The syntax to create a virtual module is as follows:
+```
+behavior = dj.create_virtual_module('behavior', 'ibl_behavior')
+```
+
+> Then `behavior` can be used to access any table:
+
+```
+behavior.TrialSet()
+```
+
+# Instructions for developers
+
+1. Email shanshen@vathes.com for a database username and initial password.
 
 2. Install Docker (https://www.docker.com/). Linux users also need to install Docker Compose separately. For Mac: https://docs.docker.com/docker-for-mac/.
@@ -19,27 +102,26 @@ If you don't have SSH setup, use `git clone https://github.com/YourUserName/IBL-
    DJ_PASS=password
 ```
 
-6. Now let's set up the docker container that have the entire environment.
+6. Now let's set up the docker container that has the entire environment.
 
-Copy `docker-compose-template.yml` as `docker-compose.yml` - this is your own file you can customize.
+> Copy `docker-compose-template.yml` as `docker-compose.yml` - this is your own file you can customize.
 
-Note: There is a similar file called `docker-compose-local_template.yml`. You will not need it unless you would like to perform ingestion from scratch in the database hosted on your own machine.
+> Note: There is a similar file called `docker-compose-local_template.yml`. You will not need it unless you would like to perform ingestion from scratch in the database hosted on your own machine.
 
-There are two properties that you may want to customize.
+> There are two properties that you may want to customize.
 
-First, to save figures in a folder outside your `IBL-pipeline` docker folder (which is good practice so you don't clutter up the Github repo), you can tell Docker to create an alias older which points to your preferred place for storing figures.
+> First, to save figures in a folder outside your `IBL-pipeline` docker folder (which is good practice so you don't clutter up the Github repo), you can tell Docker to create an alias folder which points to your preferred place for storing figures.
 
   a. `open docker-compose.yml`
-
- b. add `myFullPath:/Figures_DataJoint_shortcuts` in to the `volumes:`, where `myFullPath` could for example be `~/Google Drive/Rig building WG/DataFigures/BehaviourData_Weekly/Snapshot_DataJoint/`
-
- c. close the file
-Then save the plots from Python into `/Figures_DataJoint_shortcuts` inside the docker, then you’ll see that the plots are in the folder you want.
+ b. add any folder you would like to access within the docker container into the `volumes:`
+ for example '~/Documents/ephys_data:/ephys_data'
+
+ c. close the file
 
-Second, Set up your `.one_params`.
+> Second, set up your `.one_params`.
 
-If you have your `.one_params` in your root directory `~/.one_params`, you can directly go to Ste[ 7]. If you have your `.one_params` in another directory, please change the mapping `docker-compose.yml`
+> If you have your `.one_params` in your home directory as `~/.one_params`, you can directly go to Step 7. If you have your `.one_params` in another directory, please change the mapping in `docker-compose.yml`
 in the `volumes:` section `your-directory-to-one_params/.one_params: /root/.one_params`.
 
 After your are done with these customization, you are ready to start the docker container, by running:
@@ -52,9 +134,9 @@ Note: Anytime you would like to change the mapping from an outside folder to a d
 
 ## To run your own Python scripts ##
 
-7. After running the docker container, you may want to use enter the container to run your own script. The command is `docker exec -it ibl-pipeline_datajoint_1 /bin/bash`. You would then enter the container with the current directory `/notebooks`. You can use `cd` to navigate inside the docker container.
+7. After running the docker container, you may want to enter the container to run your own script. The command is `docker exec -it ibl-pipeline_datajoint_1 /bin/bash`. You would then enter the container with the current directory `/notebooks`. You can use `cd` to navigate inside the docker container.
 
- Note: If you would like to go to a specific folder, for example `prelim_analyses/behavioral_snapshots`at the same time when you run `docker exec`, you can use this command line: `docker exec -it docker exec -it ibl-pipeline_datajoint_1 bash -c "cd /src/IBL-pipeline/prelim_analyses/behavioral_snapshots; exec /bin/bash"`
+> Note: If you would like to go to a specific folder, for example `prelim_analyses/behavioral_snapshots`, at the same time when you run `docker exec`, you can use this command line: `docker exec -it ibl-pipeline_datajoint_1 bash -c "cd /src/IBL-pipeline/prelim_analyses/behavioral_snapshots; exec /bin/bash"`
 
 8. To simplify the process of setting up the docker environment, we prepared a bash script `ibl_docker_setup-template.sh`. You may first want to copy this template by `cp ibl_docker_setup-template.sh ibl_docker_setup.sh`, then customize your own `ibl_docker_setup.sh`. In the file, you can change the directory you want to go to in the last line. The default command in the last line is: `docker exec -it docker exec -it ibl-pipeline_datajoint_1 bash -c "cd /src/IBL-pipeline/prelim_analyses/; exec /bin/bash"`, which goes to the folder `IBL-pipeline/prelim_analyses`. You can replace this directory with the directory you would like to go to.
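Once you are inside the container (step 7 above), a quick way to confirm that the environment is wired up correctly is to run a short script against the database before starting your own analyses. The sketch below is illustrative only: the file name `connection_check.py` is hypothetical, and it assumes the container was started through `docker-compose` so that the `DJ_HOST`, `DJ_USER` and `DJ_PASS` values from your `.env` file are visible to DataJoint.

```python
# connection_check.py -- hypothetical sanity check to run inside the container
import datajoint as dj

# dj.conn() picks up DJ_HOST / DJ_USER / DJ_PASS from the environment
# (or from a saved dj_local_conf.json) and opens the connection
conn = dj.conn()
print(conn)

# import a couple of pipeline modules and count entries to confirm
# that the schemas are reachable with your credentials
from ibl_pipeline import subject, behavior

print('Subjects:', len(subject.Subject()))
print('Trial sets:', len(behavior.TrialSet()))
```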
@@ -75,73 +157,52 @@ python behavioral_snapshot.py 10. Go to http://localhost:8888/tree in your favorite browser to open Jupyter Notebook. -11. Open "Datajoint pipeline query tutorial.ipynb". +11. Open the directory `notebooks_tutorial` and feel free to go to through the tutorials. -12. Run through the notebook and feel free to experiment. ### Staying up-to date ### -To stay up-to-date with the latest code from DataJoint, you might first want to check by `git remote -v`. +To stay up-to-date with the latest code from DataJoint, you might first want to check by `git remote -v`. If there is no upstream pointing to the int-brain-lab repository, then do `git remote add upstream https://github.com/int-brain-lab/IBL-pipeline`. Then `git pull upstream master` will make sure that your local fork stays up to date with the original repo. #### Contributing code #### -If you feel happy with the changes you've made, you can add, commit and push them to your own branch. Then go to https://github.com/int-brain-lab/IBL-pipeline, click 'Pull requests', 'New pull request', 'compare across forks', and select your fork of `IBL-pipeline`. If there are no merge conflicts, you can click 'Create pull request', explain what changes/contributions you've made, and and submit it to the DataJoint team for approval. - - - ---- - -# Instructions to ingest Alyx data into local database # - -To run an local instance of database in the background, run the docker-compose command as follows: - -```bash -docker-compose -f docker-compose-local.yml up -d -``` - -This will create a docker container with a local database inside. To access the docker from the terminal, first get the docker container ID with `docker ps`, then run: +If you feel happy with the changes you've made, you can add, commit and push them to your own branch. Then go to https://github.com/int-brain-lab/IBL-pipeline, click 'Pull requests', 'New pull request', 'compare across forks', and select your fork of `IBL-pipeline`. If there are no merge conflicts, you can click 'Create pull request', explain what changes/contributions you've made, and and submit it to the DataJoint team for approval. -```bash -docker exec -it CONTAINER_ID /bin/bash -``` -Now we are in the docker, and run the bash script for the ingestion: -``` -bash /src/ibl-pipeline/scripts/ingest_alyx.sh ../data/alyx_dump/2018-10-30_alyxfull.json -``` - -Make sure that the json file is in the correct directory as shown above. +# IBL pipeline schemas # -To turn stop the containers, run: +Schema of `reference`: +![Reference Diagram](images/ephys.png) -```bash -docker-compose -f docker-compose-local.yml down -``` +Schema of `subject`: +![Subject Diagram](images/subject.png) -# Instructions to ingest Alyx data into Amazon RDS +Schema of `action`: +![Action Diagram](images/action.png) -To insert Alyx data into the remote Amazon RDS, create a .env file in the same directory of your `docker-compose.yml`, as instructed in Step 4 above. +Schema of `acquisition`: +![Acquisition Diagram](images/acquisition.png) -Now run the docker-compose as follows, it will by default run through the file `docker-compose.yml` +Schema of `data`: +![DataDiagram](images/data.png) -```bash -docker-compose up -d -``` +Schema of `behavior` +![Behavior erd](images/behavior.png) -This will create a docker container and link to the remote Amazon RDS. Then follow the same instruction of ingestion to the local database. 
+Schema of `behavior_analyses`: +![Behavior analyses Diagram](images/behavior_analyses.png) -# IBL pipeline schemas # +Schema of `ephys` +![Ephys erd](images/ephys.png) -Alyx-corresponding schemas, including, `referenall_erd.save('/images/all_erd.png')ce`, `subject`, `action`, `acquisition`, and `data` +Schema of `histology`: +![Histology Diagram](images/histology.png) -![Alyx_corresponding erd](images/alyx_erd.png) +Schema of `qc`: -Schema of `ephys` -![Ephys erd](images/ephys_erd.png) -Schema of `behavior` -![Behavior erd](images/behavior_erd.png) +![Quality check Diagram](images/qc.png) diff --git a/apt_requirements.txt b/apt_requirements.txt new file mode 100644 index 00000000..f359f073 --- /dev/null +++ b/apt_requirements.txt @@ -0,0 +1 @@ +libgl1-mesa-glx diff --git a/docker-compose-db-test.yml b/docker-compose-db-test.yml index b2ee3222..73c71004 100644 --- a/docker-compose-db-test.yml +++ b/docker-compose-db-test.yml @@ -5,10 +5,15 @@ services: container_name: ibl_datajoint_dbtest env_file: .env_dbtest volumes: - - ./notebooks:/notebooks + - ./notebooks:/home/dja - ./images:/images - .:/src/IBL-pipeline - ./data:/data - ./root/.one_params:/root/.one_params + user: 1000:anaconda ports: - - "8400:8888" + - "8920:8888" + networks: + - ibl_dbtest +networks: + ibl_dbtest: diff --git a/docker-compose-local-template.yml b/docker-compose-local-template.yml index 0fb81a35..c4f4cd95 100644 --- a/docker-compose-local-template.yml +++ b/docker-compose-local-template.yml @@ -7,19 +7,24 @@ services: - DJ_USER=root - DJ_PASS=simple volumes: - - ./notebooks:/notebooks + - ./notebooks:/home/dja - ./images:/images - .:/src/IBL-pipeline - ./data:/data - ./root/.one_params:/root/.one_params - - ./snapshots:/Snapshot_DataJoint_shortcut links: - db + user: 1000:anaconda ports: - "8888:8888" + networks: + - ibl_local + db: image: datajoint/mysql environment: - MYSQL_ROOT_PASSWORD=simple ports: - "4306:3306" +networks: + ibl_local diff --git a/docker-compose-public.yml b/docker-compose-public.yml index d0e85337..3fe25676 100644 --- a/docker-compose-public.yml +++ b/docker-compose-public.yml @@ -5,11 +5,15 @@ services: container_name: ibl_datajoint_public env_file: .env_public volumes: - - ./notebooks:/notebooks + - ./notebooks:/home/dja - ./images:/images - .:/src/IBL-pipeline - ./data:/data - ./root/.one_params:/root/.one_params - - ./snapshots:/Figures_DataJoint_shortcuts + user: 1000:anaconda ports: - "8300:8888" + networks: + - ibl_public +networks: + ibl_public: diff --git a/docker-compose-template.yml b/docker-compose-template.yml index 8113a060..0a87f3c4 100644 --- a/docker-compose-template.yml +++ b/docker-compose-template.yml @@ -4,11 +4,15 @@ services: build: . env_file: .env volumes: - - ./notebooks:/notebooks + - ./notebooks:/home/dja - ./images:/images - .:/src/IBL-pipeline - ./data:/data - - ~/.one_params:/root/.one_params - - ./snapshots:/Figures_DataJoint_shortcuts + - ./root/.one_params:/home/dja/.one_params + user: 1000:anaconda ports: - "8888:8888" + networks: + - ibl +networks: + ibl: diff --git a/docker-compose-test-new.yml b/docker-compose-test-new.yml new file mode 100644 index 00000000..2b6c3fa3 --- /dev/null +++ b/docker-compose-test-new.yml @@ -0,0 +1,17 @@ +version: '3' +services: + datajoint_test_new: + build: + context: . 
+ dockerfile: Dockerfile_new + container_name: ibl_datajoint_test_new + env_file: .env_test + volumes: + - ./notebooks:/home/dja + - ./images:/images + - .:/src/IBL-pipeline + - ./data:/data + - ./root/.one_params:/home/dja/.one_params + user: 1000:anaconda + ports: + - "9999:8888" diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 04bcaebb..f09bf7d4 100644 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -5,10 +5,15 @@ services: container_name: ibl_datajoint_test env_file: .env_test volumes: - - ./notebooks:/notebooks + - ./notebooks:/home/dja - ./images:/images - .:/src/IBL-pipeline - ./data:/data - ./root/.one_params:/root/.one_params + user: 1000:anaconda ports: - - "8900:8888" + - "9999:8888" + networks: + - ibl_test +networks: + ibl_test: diff --git a/docker-compose-updates.yml b/docker-compose-updates.yml index d4ef88c0..f0e9f9f7 100644 --- a/docker-compose-updates.yml +++ b/docker-compose-updates.yml @@ -5,11 +5,16 @@ services: container_name: ibl_datajoint env_file: .env_update volumes: - - ./notebooks:/notebooks + - ./notebooks:/home/dja - ./images:/images - .:/src/IBL-pipeline - ./data:/data - ./root/.one_params:/root/.one_params - - ./snapshots:/Figures_DataJoint_shortcuts + user: 1000:anaconda ports: - "8902:8888" + networks: + - ibl_update + +networks: + ibl_update: diff --git a/ibl_pipeline/__init__.py b/ibl_pipeline/__init__.py old mode 100644 new mode 100755 index b36aa9ff..37fe503c --- a/ibl_pipeline/__init__.py +++ b/ibl_pipeline/__init__.py @@ -44,5 +44,9 @@ class S3Access(dj.Manual): secret_key=secret_key, bucket='ibl-dj-external', location='/plotting' + ), + 'ephys_local': dict( + protocol='file', + location='/data/ephys' ) } diff --git a/ibl_pipeline/acquisition.py b/ibl_pipeline/acquisition.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/action.py b/ibl_pipeline/action.py old mode 100644 new mode 100755 index adfb10f4..cd25492e --- a/ibl_pipeline/action.py +++ b/ibl_pipeline/action.py @@ -183,3 +183,40 @@ class OtherActionProcedure(dj.Manual): procedure_type_name: varchar(255) otheractionprocedure_ts=CURRENT_TIMESTAMP: timestamp """ + + +@schema +class CullMethod(dj.Lookup): + definition = """ + cull_method: varchar(64) + --- + cull_method_uuid: uuid + cull_method_description='': varchar(255) + cull_method_ts=CURRENT_TIMESTAMP: timestamp + """ + + +@schema +class CullReason(dj.Lookup): + definition = """ + cull_reason: varchar(64) + --- + cull_reason_uuid: uuid + cull_reason_description='': varchar(255) + cull_reason_ts=CURRENT_TIMESTAMP: timestamp + """ + + +@schema +class Cull(dj.Manual): + definition = """ + subject_uuid: uuid + --- + cull_uuid: uuid + cull_date: date + cull_user=null: varchar(255) + cull_reason=null: varchar(64) + cull_method=null: varchar(64) + cull_description='': varchar(1024) + cull_ts=CURRENT_TIMESTAMP: timestamp + """ diff --git a/ibl_pipeline/analyses/__init__.py b/ibl_pipeline/analyses/__init__.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/analyses/analysis_utils.py b/ibl_pipeline/analyses/analysis_utils.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/analyses/behavior.py b/ibl_pipeline/analyses/behavior.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/analyses/end_session_criteria.py b/ibl_pipeline/analyses/end_session_criteria.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/analyses/ephys.py b/ibl_pipeline/analyses/ephys.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/behavior.py b/ibl_pipeline/behavior.py index 
f0728c79..9730c8dd 100755 --- a/ibl_pipeline/behavior.py +++ b/ibl_pipeline/behavior.py @@ -9,7 +9,7 @@ try: from oneibl.one import ONE import alf.io - one = ONE() + one = ONE(silent=True) except ImportError: warnings.warn('ONE not installed, cannot use populate') pass @@ -124,7 +124,8 @@ def make(self, key): wheel_position, wheel_velocity, wheel_timestamps = \ one.load(eID, dataset_types=['wheel.position', 'wheel.velocity', - 'wheel.timestamps']) + 'wheel.timestamps'], + ) wheel_sampling_rate = 1 / np.median(np.diff(wheel_timestamps)) @@ -432,12 +433,17 @@ def make(self, key): eID, dataset_types='trials.stimOn_times', clobber=True) - if np.all(np.isnan(stimOn_times)): - key['stim_on_times_status'] = 'Missing' - elif np.any(np.isnan(stimOn_times)): - key['stim_on_times_status'] = 'Partial' + if stimOn_times is not None and len(stimOn_times): + if (len(stimOn_times)==1 and stimOn_times[0] is None) or \ + np.all(np.isnan(np.array(stimOn_times))): + key['stim_on_times_status'] = 'Missing' + elif np.any(np.isnan(np.array(stimOn_times))): + key['stim_on_times_status'] = 'Partial' + else: + key['stim_on_times_status'] = 'Complete' else: - key['stim_on_times_status'] = 'Complete' + key['stim_on_times_status'] = 'Missing' + if '_ibl_trials.repNum.npy' not in datasets: key['rep_num_status'] = 'Missing' @@ -490,8 +496,8 @@ class TrialSet(dj.Imported): """ # Knowledge based hack to be formalized better later - if not environ.get('MODE') == 'test': - key_source = acquisition.Session & CompleteTrialSession + # if not environ.get('MODE') == 'test': + key_source = acquisition.Session & CompleteTrialSession def make(self, key): @@ -519,7 +525,7 @@ def make(self, key): eID, dataset_types=dtypes, download_only=True, clobber=True) ses_path = alf.io.get_session_path(files[0]) trials = alf.io.load_object( - ses_path.joinpath('alf'), '_ibl_trials') + ses_path.joinpath('alf'), 'trials') status = (CompleteTrialSession & key).fetch1() diff --git a/ibl_pipeline/common.py b/ibl_pipeline/common.py old mode 100644 new mode 100755 index dcad6041..1361e03b --- a/ibl_pipeline/common.py +++ b/ibl_pipeline/common.py @@ -1,5 +1,11 @@ -from ibl_pipeline import reference, subject, action, acquisition, behavior, ephys, histology +from ibl_pipeline import reference, subject, action, acquisition, data, behavior from ibl_pipeline.analyses import behavior as behavior_analyses -from ibl_pipeline.analyses import ephys as ephys_analyses from ibl_pipeline.plotting import behavior as behavior_plotting -from ibl_pipeline.plotting import ephys as ephys_plotting + +from os import environ + +mode = environ.get('MODE') +if mode != 'public': + from ibl_pipeline import ephys, histology + from ibl_pipeline.analyses import ephys as ephys_analyses + from ibl_pipeline.plotting import ephys as ephys_plotting diff --git a/ibl_pipeline/data.py b/ibl_pipeline/data.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/ephys.py b/ibl_pipeline/ephys.py index e5867ad4..242c2ff0 100755 --- a/ibl_pipeline/ephys.py +++ b/ibl_pipeline/ephys.py @@ -64,23 +64,25 @@ class CompleteClusterSession(dj.Computed): 'spikes.clusters.npy', 'spikes.depths.npy', 'spikes.samples.npy', - 'spikes.templates.npy', - 'spikes.times.npy' + 'spikes.templates.npy' ] key_source = acquisition.Session & \ 'task_protocol like "%ephysChoiceWorld%"' \ - & (data.FileRecord & 'dataset_name="spikes.times.npy"') \ + & (data.FileRecord & 'dataset_name like "%spikes.times%.npy"') \ & (data.FileRecord & 'dataset_name="spikes.clusters.npy"') \ & (data.FileRecord & 
'dataset_name="probes.description.json"') def make(self, key): + datasets = (data.FileRecord & key & 'repo_name LIKE "flatiron_%"' & {'exists': 1}).fetch('dataset_name') is_complete = bool(np.all([req_ds in datasets - for req_ds in self.required_datasets])) + for req_ds in self.required_datasets])) \ + and bool(np.any(['spikes.times' in d for d in datasets])) + if is_complete: self.insert1(key) - (EphysMissingDataLog & key).delete() + (EphysMissingDataLog & key).delete_quick() else: for req_ds in self.required_datasets: if req_ds not in datasets: @@ -194,6 +196,11 @@ class DefaultCluster(dj.Imported): def make(self, key): eID = str((acquisition.Session & key).fetch1('session_uuid')) + spikes_times_dtype_name = ( + data.FileRecord & key & + 'dataset_name like "%spikes.times%.npy"').fetch1( + 'dataset_name').split('.npy')[0] + dtypes = [ 'clusters.amps', 'clusters.channels', @@ -208,7 +215,7 @@ def make(self, key): 'spikes.depths', 'spikes.samples', 'spikes.templates', - 'spikes.times' + spikes_times_dtype_name ] files = one.load(eID, dataset_types=dtypes, download_only=True, @@ -222,7 +229,14 @@ def make(self, key): spikes = alf.io.load_object( ses_path.joinpath('alf', probe_name), 'spikes') - max_spike_time = spikes.times[-1] + time_fnames = [k for k in spikes.keys() if 'times' in k] + + if len(time_fnames) > 1: + raise ValueError('More than one fields of spikes are about times: {}'.format(spikes.keys())) + else: + time_fname = time_fnames[0] + + max_spike_time = spikes[time_fname][-1] for icluster, cluster_uuid in tqdm(enumerate(clusters.uuids['uuids']), position=0): @@ -238,7 +252,7 @@ def make(self, key): cluster_waveforms_channels=clusters.waveformsChannels[icluster], cluster_depth=clusters.depths[icluster], cluster_peak_to_trough=clusters.peakToTrough[icluster], - cluster_spikes_times=spikes.times[idx], + cluster_spikes_times=spikes[time_fname][idx], cluster_spikes_depths=spikes.depths[idx], cluster_spikes_amps=spikes.amps[idx], cluster_spikes_templates=spikes.templates[idx], @@ -268,7 +282,7 @@ def make(self, key): [dict(**key, cluster_id=icluster, metric_name=name, metric_value=value) for name, value in metrics.to_dict().items() - if name != 'ks2_label' and not np.isnan(value)] + if name != 'ks2_label' and not np.isnan(value) and not np.isinf(value)] ) class Metric(dj.Part): @@ -319,6 +333,7 @@ class GoodCluster(dj.Computed): --- is_good=0: bool # whether the unit is good """ + def make(self, key): firing_rate = (DefaultCluster.Metrics & key).fetch1('firing_rate') @@ -391,7 +406,7 @@ def make(self, key): trial_spike_time = spike_times[spike_ids == itrial*2+1] if not len(trial_spike_time): - trial_spk['trial_spike_times'] = [] + trial_spk['trial_spike_times'] = np.array([]) else: if event == 'stim on': trial_spk['trial_spike_times'] = \ @@ -405,7 +420,8 @@ def make(self, key): trial_spike_time - trial_feedback_times[itrial] else: continue - trial_spk['event'] = event - trial_spks.append(trial_spk.copy()) + + trial_spk['event'] = event + trial_spks.append(trial_spk.copy()) self.insert(trial_spks) diff --git a/ibl_pipeline/group_shared/__init__.py b/ibl_pipeline/group_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ibl_pipeline/group_shared/wheel.py b/ibl_pipeline/group_shared/wheel.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/histology.py b/ibl_pipeline/histology.py old mode 100644 new mode 100755 index cf1a87b7..473172c4 --- a/ibl_pipeline/histology.py +++ b/ibl_pipeline/histology.py @@ -5,6 +5,7 @@ from os import path, environ import 
numpy as np from .utils import atlas +import pdb try: from ibllib.pipes.ephys_alignment import EphysAlignment @@ -22,6 +23,7 @@ @schema class InsertionDataSource(dj.Lookup): definition = """ + # Method to estimate the probe trajectory, including Ephys aligned histology track, Histology track, Micro-manipulator, and Planned insertion_data_source: varchar(128) # type of trajectory --- provenance: int # provenance code @@ -37,7 +39,7 @@ class InsertionDataSource(dj.Lookup): @schema class ProbeTrajectory(dj.Imported): definition = """ - # data imported from probes.trajectory + # Probe trajectory estimated with each method, ingested from Alyx table experiments.probetrajectory -> ephys.ProbeInsertion -> InsertionDataSource --- @@ -69,6 +71,7 @@ def make(self, key): @schema class ChannelBrainLocation(dj.Imported): definition = """ + # Brain coordinates and region assignment of each channel, ingested from Alyx table experiments.channel -> ProbeTrajectory channel_brain_location_uuid : uuid --- @@ -84,6 +87,7 @@ class ChannelBrainLocation(dj.Imported): @schema class ClusterBrainRegion(dj.Computed): definition = """ + # Brain region assignment to each cluster -> ephys.DefaultCluster -> InsertionDataSource --- @@ -112,6 +116,14 @@ def make(self, key): 'ontology', 'acronym') self.insert1(key) + elif len(q) > 1: + ontology, acronym = q.fetch('ontology', 'acronym') + if len(set(acronym)) == 1: + key['ontology'] = 'CCF 2017' + key['acronym'] = acronym[0] + self.insert1(key) + else: + print('Conflict regions') else: return @@ -119,6 +131,8 @@ def make(self, key): @schema class SessionBrainRegion(dj.Computed): definition = """ + # Brain regions assignment to each session + # including the regions of finest granularity and their upper-level areas. -> acquisition.Session -> reference.BrainRegion """ @@ -138,6 +152,7 @@ def make(self, key): @schema class DepthBrainRegion(dj.Computed): definition = """ + # For each ProbeTrajectory, assign depth boundaries relative to the probe tip to each brain region covered by the trajectory -> ProbeTrajectory --- region_boundaries : blob @@ -159,3 +174,163 @@ def make(self, key): xyz_channels.astype('float')/1e6, axial.astype('float')) self.insert1(key) + + +# ================= The following tables will replace the above ones eventually ==================== + +@schema +class Provenance(dj.Lookup): + definition = """ + # Method to estimate the probe trajectory, including Ephys aligned histology track, Histology track, Micro-manipulator, and Planned + provenance : tinyint unsigned # provenance code + --- + provenance_description : varchar(128) # type of trajectory + """ + contents = [ + (70, 'Ephys aligned histology track'), + (50, 'Histology track'), + (30, 'Micro-manipulator'), + (10, 'Planned'), + ] + + +@schema +class ProbeTrajectoryTemp(dj.Imported): + definition = """ + # Probe trajectory estimated with each method, ingested from Alyx table experiments.trajectoryestimate + -> ephys.ProbeInsertion + -> Provenance + --- + -> [nullable] reference.CoordinateSystem + probe_trajectory_uuid: uuid + x: float # (um) medio-lateral coordinate relative to Bregma, left negative + y: float # (um) antero-posterior coordinate relative to Bregma, back negative + z: float # (um) dorso-ventral coordinate relative to Bregma, ventral negative + phi: float # (degrees)[-180 180] azimuth + theta: float # (degrees)[0 180] polar angle + depth: float # (um) insertion depth + roll=null: float # (degrees) roll angle of the probe + trajectory_ts: datetime + """ + keys = 
histology_ingest.ProbeTrajectory.fetch( + 'subject_uuid', 'session_start_time', 'probe_idx', + 'insertion_data_source', as_dict=True) + key_source = ephys.ProbeInsertion * InsertionDataSource & keys + + def make(self, key): + + trajs = (histology_ingest.ProbeTrajectory & key).fetch(as_dict=True) + for traj in trajs: + if not traj['coordinate_system_name']: + traj.pop('coordinate_system_name') + self.insert1(traj, skip_duplicates=True) + + +@schema +class ChannelBrainLocationTemp(dj.Imported): + definition = """ + # Brain coordinates and region assignment of each channel, ingested from Alyx table experiments.channel + -> ProbeTrajectoryTemp + channel_brain_location_uuid : uuid + --- + channel_axial : decimal(6, 1) + channel_lateral : decimal(6, 1) + channel_x : decimal(6, 1) + channel_y : decimal(6, 1) + channel_z : decimal(6, 1) + -> reference.BrainRegion + """ + + +@schema +class DepthBrainRegionTemp(dj.Computed): + definition = """ + # For each ProbeTrajectory, assign depth boundaries relative to the probe tip to each brain region covered by the trajectory + -> ProbeTrajectoryTemp + --- + region_boundaries : blob + region_label : blob + region_color : blob + region_id : blob + """ + key_source = ProbeTrajectoryTemp & ChannelBrainLocationTemp + + def make(self, key): + + x, y, z, axial = (ChannelBrainLocationTemp & key).fetch( + 'channel_x', 'channel_y', 'channel_z', 'channel_axial', + order_by='channel_axial') + xyz_channels = np.c_[x, y, z] + key['region_boundaries'], key['region_label'], \ + key['region_color'], key['region_id'] = \ + EphysAlignment.get_histology_regions( + xyz_channels.astype('float')/1e6, axial.astype('float')) + + self.insert1(key) + + +@schema +class ClusterBrainRegionTemp(dj.Computed): + definition = """ + # Brain region assignment to each cluster + -> ephys.DefaultCluster + -> ProbeTrajectoryTemp + -> ephys.ChannelGroup + --- + -> reference.BrainRegion + """ + key_source = ephys.DefaultCluster * Provenance & \ + ProbeTrajectoryTemp & ephys.ChannelGroup & ChannelBrainLocationTemp + + def make(self, key): + # pdb.set_trace() + channel_raw_inds, channel_local_coordinates = \ + (ephys.ChannelGroup & key).fetch1( + 'channel_raw_inds', 'channel_local_coordinates') + channel = (ephys.DefaultCluster & key).fetch1('cluster_channel') + if channel in channel_raw_inds: + channel_coords = np.squeeze( + channel_local_coordinates[channel_raw_inds == channel]) + else: + return + + q = ChannelBrainLocationTemp & key & \ + dict(channel_lateral=channel_coords[0], + channel_axial=channel_coords[1]) + + if len(q) == 1: + key['ontology'], key['acronym'] = q.fetch1( + 'ontology', 'acronym') + + self.insert1(key) + elif len(q) > 1: + ontology, acronym = q.fetch('ontology', 'acronym') + if len(np.unique(acronym)) == 1: + key['ontology'] = 'CCF 2017' + key['acronym'] = acronym[0] + self.insert1(key) + else: + print('Conflict regions') + else: + return + + +@schema +class ProbeBrainRegionTemp(dj.Computed): + definition = """ + # Brain regions assignment to each probe insertion, including the regions of finest granularity and their upper-level areas. 
+ -> ProbeTrajectoryTemp + -> reference.BrainRegion + """ + key_source = ProbeTrajectoryTemp & ClusterBrainRegionTemp + + def make(self, key): + + regions = (dj.U('acronym') & (ClusterBrainRegionTemp & key)).fetch('acronym') + + associated_regions = [ + atlas.BrainAtlas.get_parents(acronym) + for acronym in regions] + list(regions) + + self.insert([dict(**key, ontology='CCF 2017', acronym=region) + for region in np.unique(np.hstack(associated_regions))]) diff --git a/ibl_pipeline/ingest/__init__.py b/ibl_pipeline/ingest/__init__.py old mode 100644 new mode 100755 index 9f637093..b5954abc --- a/ibl_pipeline/ingest/__init__.py +++ b/ibl_pipeline/ingest/__init__.py @@ -54,6 +54,7 @@ ''' import logging import datajoint as dj +from tqdm import tqdm from . import alyxraw import os @@ -83,6 +84,7 @@ class InsertBuffer(object): def __init__(self, rel): self._rel = rel self._queue = [] + self._delete_queue = [] def insert1(self, r): self._queue.append(r) @@ -90,6 +92,9 @@ def insert1(self, r): def insert(self, recs): self._queue += recs + def delete1(self, r): + self._delete_queue.append(r) + def flush(self, replace=False, skip_duplicates=False, ignore_extra_fields=False, allow_direct_insert=False, chunksz=1): ''' @@ -117,3 +122,49 @@ def flush(self, replace=False, skip_duplicates=False, return qlen else: return 0 + + def flush_delete(self, chunksz=1, quick=True): + ''' + flush the buffer + XXX: ignore_extra_fields na, requires .insert() support + ''' + + qlen = len(self._delete_queue) + if qlen > 0 and qlen % chunksz == 0: + try: + if quick: + (self._rel & self._delete_queue).delete_quick() + else: + (self._rel & self._delete_queue).delete() + except Exception as e: + print('error in flush delete: {}, trying deletion one by one'.format(e)) + for t in self._delete_queue: + try: + if quick: + (self._rel & self._delete_queue).delete_quick() + else: + (self._rel & self._delete_queue).delete() + except Exception as e: + print('error in flush delete: {}'.format(e)) + self._delete_queue.clear() + return qlen + else: + return 0 + + +def populate_batch(t, chunksz=1000, verbose=True): + + keys = (t.key_source - t.proj()).fetch('KEY') + table = InsertBuffer(t) + for key in tqdm(keys, position=0): + entry = t.create_entry(key) + if entry: + table.insert1(entry) + + if table.flush( + skip_duplicates=True, + allow_direct_insert=True, chunksz=chunksz) and verbose: + print(f'Inserted {chunksz} {t.__name__} tuples.') + + if table.flush(skip_duplicates=True, allow_direct_insert=True) and verbose: + print(f'Inserted all remaining {t.__name__} tuples.') diff --git a/ibl_pipeline/ingest/acquisition.py b/ibl_pipeline/ingest/acquisition.py old mode 100644 new mode 100755 index 9188c419..b0c4cf4d --- a/ibl_pipeline/ingest/acquisition.py +++ b/ibl_pipeline/ingest/acquisition.py @@ -3,6 +3,7 @@ import uuid from . import alyxraw, reference, subject, action +from .. import acquisition from . 
import get_raw_field as grf schema = dj.schema(dj.config.get('database.prefix', '') + @@ -12,7 +13,7 @@ @schema class Session(dj.Computed): definition = """ - (session_uuid) -> alyxraw.AlyxRaw + -> alyxraw.AlyxRaw.proj(session_uuid='uuid') --- session_number=null: int subject_uuid: uuid @@ -28,7 +29,8 @@ class Session(dj.Computed): key_source = (alyxraw.AlyxRaw & 'model="actions.session"').proj( session_uuid='uuid') - def make(self, key): + @staticmethod + def create_entry(key): key_session = key.copy() key['uuid'] = key['session_uuid'] key_session['subject_uuid'] = uuid.UUID(grf(key, 'subject')) @@ -67,7 +69,12 @@ def make(self, key): if protocol != 'None': key_session['task_protocol'] = protocol - self.insert1(key_session) + return key_session + + def make(self, key): + + self.insert1( + Session.create_entry(key)) @schema @@ -123,3 +130,36 @@ class WaterAdministrationSession(dj.Manual): session_start_time: datetime wateradministrationsession_ts=CURRENT_TIMESTAMP: timestamp """ + + +@schema +class SessionQC(dj.Manual): + definition = """ + subject_uuid : uuid + session_start_time : datetime + --- + qc : tinyint unsigned + sessionqc_ts=CURRENT_TIMESTAMP: timestamp + """ + + +@schema +class SessionExtendedQC(dj.Manual): + definition = """ + subject_uuid : uuid + session_start_time : datetime + qc_type : varchar(16) + --- + extended_qc : tinyint unsigned + session_extended_qc_ts=CURRENT_TIMESTAMP: timestamp + """ + + class Field(dj.Part): + definition = """ + -> master + qc_fname : varchar(32) + --- + qc_fvalue_float=null : float + qc_fvalue_str=null : varchar(32) + qc_fvalue_blob=null : blob + """ diff --git a/ibl_pipeline/ingest/action.py b/ibl_pipeline/ingest/action.py old mode 100644 new mode 100755 index 2b2619c3..a4e89ac0 --- a/ibl_pipeline/ingest/action.py +++ b/ibl_pipeline/ingest/action.py @@ -1,4 +1,5 @@ import datajoint as dj +from datajoint.errors import DataJointError import uuid from . import alyxraw, reference, subject from . 
import get_raw_field as grf @@ -240,7 +241,7 @@ def make(self, key): key_surgery['subject_uuid'] = uuid.UUID(grf(key, 'subject')) if not len(subject.Subject & key_surgery): print('Subject {} is not in the table subject.Subject'.format( - key_surgery['surgery_uuid'] + key_surgery['subject_uuid'] )) key_surgery['surgery_start_time'] = grf(key, 'start_time') @@ -365,3 +366,102 @@ class OtherActionProcedure(dj.Manual): procedure_type_name: varchar(255) otheractionprocedure_ts=CURRENT_TIMESTAMP: timestamp """ + + +@schema +class CullMethod(dj.Computed): + definition = """ + -> alyxraw.AlyxRaw.proj(cull_method_uuid='uuid') + --- + cull_method: varchar(64) + cull_method_description='': varchar(255) + cull_method_ts=CURRENT_TIMESTAMP: timestamp + """ + key_source = (alyxraw.AlyxRaw & 'model="actions.cullmethod"').proj( + cull_method_uuid='uuid') + + def make(self, key): + key_cm = key.copy() + key['uuid'] = key['cull_method_uuid'] + key_cm['cull_method'] = grf(key, 'name') + + description = grf(key, 'description') + if description != 'None': + key_cm['cull_method_description'] = description + + self.insert1(key_cm) + + +@schema +class CullReason(dj.Computed): + definition = """ + -> alyxraw.AlyxRaw.proj(cull_reason_uuid='uuid') + --- + cull_reason: varchar(64) + cull_reason_description='': varchar(255) + cull_reason_ts=CURRENT_TIMESTAMP: timestamp + """ + key_source = (alyxraw.AlyxRaw & 'model="actions.cullreason"').proj( + cull_reason_uuid='uuid') + + def make(self, key): + key_cr = key.copy() + key['uuid'] = key['cull_reason_uuid'] + key_cr['cull_reason'] = grf(key, 'name') + + description = grf(key, 'description') + if description != 'None': + key_cr['cull_reason_description'] = description + + self.insert1(key_cr) + + +@schema +class Cull(dj.Computed): + definition = """ + -> alyxraw.AlyxRaw.proj(cull_uuid='uuid') + --- + subject_uuid: uuid + cull_date: date + cull_user=null: varchar(255) + cull_reason=null: varchar(64) + cull_method=null: varchar(64) + cull_description='': varchar(1024) + cull_ts=CURRENT_TIMESTAMP: timestamp + """ + key_source = (alyxraw.AlyxRaw & 'model = "actions.cull"').proj( + cull_uuid='uuid') + + def make(self, key): + key_cull = key.copy() + key['uuid'] = key['cull_uuid'] + + key_cull['subject_uuid'] = uuid.UUID(grf(key, 'subject')) + if not len(subject.Subject & key_cull): + print('Subject {} is not in the table subject.Subject'.format( + key_cull['subject_uuid'] + )) + return + + user_uuid = grf(key, 'user') + if user_uuid != 'None': + key_cull['cull_user'] = ( + reference.LabMember & {'user_uuid': user_uuid}).fetch1('user_name') + + cull_method_uuid = grf(key, 'cull_method') + if cull_method_uuid != 'None': + key_cull['cull_method'] = ( + CullMethod & {'cull_method_uuid': cull_method_uuid}).fetch1('cull_method') + + cull_reason_uuid = grf(key, 'cull_reason') + if cull_reason_uuid != 'None': + key_cull['cull_reason'] = ( + CullReason & {'cull_reason_uuid': cull_reason_uuid}).fetch1('cull_reason') + + description = grf(key, 'description') + if description != 'None': + key_cull['cull_description'] = description + + key_cull['cull_date'] = grf(key, 'date') + + self.insert1(key_cull) diff --git a/ibl_pipeline/ingest/alyxraw.py b/ibl_pipeline/ingest/alyxraw.py old mode 100644 new mode 100755 index 4b9ac5c8..3e22b3bb --- a/ibl_pipeline/ingest/alyxraw.py +++ b/ibl_pipeline/ingest/alyxraw.py @@ -1,5 +1,6 @@ import datajoint as dj + schema = dj.schema(dj.config.get('database.prefix', '') + 'ibl_alyxraw') @@ -21,3 +22,10 @@ class Field(dj.Part): fvalue=null: varchar(40000) # 
field value in the position of value_idx index (fname) ''' + + +@schema +class ProblematicData(dj.Manual): + definition = """ + -> AlyxRaw + """ diff --git a/ibl_pipeline/ingest/common.py b/ibl_pipeline/ingest/common.py new file mode 100755 index 00000000..eb143483 --- /dev/null +++ b/ibl_pipeline/ingest/common.py @@ -0,0 +1,16 @@ +''' +Pre-import all ingest modules +''' +from ibl_pipeline.ingest import alyxraw +from ibl_pipeline.ingest import reference as reference_ingest +from ibl_pipeline.ingest import subject as subject_ingest +from ibl_pipeline.ingest import action as action_ingest +from ibl_pipeline.ingest import acquisition as acquisition_ingest +from ibl_pipeline.ingest import data as data_ingest + +from os import environ + +mode = environ.get('MODE') +if mode != 'public': + from ibl_pipeline.ingest import ephys as ephys_ingest + from ibl_pipeline.ingest import histology as histology_ingest diff --git a/ibl_pipeline/ingest/data.py b/ibl_pipeline/ingest/data.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/ingest/ephys.py b/ibl_pipeline/ingest/ephys.py old mode 100644 new mode 100755 index 0bf7d754..8942a1c7 --- a/ibl_pipeline/ingest/ephys.py +++ b/ibl_pipeline/ingest/ephys.py @@ -1,4 +1,5 @@ import datajoint as dj +from datajoint.errors import DataJointError import json import uuid import re @@ -6,6 +7,8 @@ from . import alyxraw, reference, acquisition from . import get_raw_field as grf +from ibl_pipeline import acquisition as acquisition_real + schema = dj.schema(dj.config.get('database.prefix', '') + 'ibl_ingest_ephys') @@ -63,9 +66,16 @@ def make(self, key): key['uuid'] = key['probe_insertion_uuid'] session_uuid = grf(key, 'session') - subject_uuid, session_start_time = \ - (acquisition.Session & dict(session_uuid=session_uuid)).fetch1( - 'subject_uuid', 'session_start_time') + session_key = dict(session_uuid=session_uuid) + + try: + subject_uuid, session_start_time = \ + (acquisition_real.Session & session_key).fetch1( + 'subject_uuid', 'session_start_time') + except DataJointError: + subject_uuid, session_start_time = \ + (acquisition.Session & session_key).fetch1( + 'subject_uuid', 'session_start_time') key_pi.update( subject_uuid=subject_uuid, diff --git a/ibl_pipeline/ingest/histology.py b/ibl_pipeline/ingest/histology.py old mode 100644 new mode 100755 index 6616e780..15b545d0 --- a/ibl_pipeline/ingest/histology.py +++ b/ibl_pipeline/ingest/histology.py @@ -1,11 +1,16 @@ import datajoint as dj +from datajoint.errors import DataJointError import json import uuid import re +import pdb from . import alyxraw, reference, acquisition, ephys from . 
import get_raw_field as grf +from ibl_pipeline import acquisition as acquisition_real +from ibl_pipeline import ephys as ephys_real + schema = dj.schema(dj.config.get('database.prefix', '') + 'ibl_ingest_histology') @@ -45,7 +50,7 @@ class ProbeTrajectory(dj.Imported): trajectory_ts: datetime """ - key_source = (alyxraw.AlyxRaw & 'model="experiments.trajectoryestimate"').proj( + key_source = (alyxraw.AlyxRaw - alyxraw.ProblematicData & 'model="experiments.trajectoryestimate"').proj( probe_trajectory_uuid='uuid') def make(self, key): @@ -87,8 +92,12 @@ def make(self, key): (reference.CoordinateSystem & {'coordinate_system_uuid': coord_uuid}).fetch1( 'coordinate_system_name') - + # try: self.insert1(key_traj) + # except ValueError: + # alyxraw.ProblematicData.insert1( + # {'uuid': key_traj['probe_trajectory_uuid']} + # ) @schema @@ -146,3 +155,151 @@ def make(self, key): ) self.insert1(key_brain_loc) + + +# These tables will replace the above ones eventually + + +@schema +class Provenance(dj.Lookup): + definition = """ + provenance : tinyint unsigned # provenance code + --- + provenance_description : varchar(128) # type of trajectory + """ + contents = [ + (70, 'Ephys aligned histology track'), + (50, 'Histology track'), + (30, 'Micro-manipulator'), + (10, 'Planned'), + ] + + +@schema +class ProbeTrajectoryTemp(dj.Imported): + definition = """ + (probe_trajectory_uuid) -> alyxraw.AlyxRaw + --- + subject_uuid: uuid + session_start_time: datetime + probe_idx: int + x: float + y: float + z: float + depth: float + theta: float + phi: float + roll=null: float + provenance: tinyint unsigned + coordinate_system_name=null: varchar(32) + trajectory_ts: datetime + """ + + key_source = ((alyxraw.AlyxRaw - alyxraw.ProblematicData) & 'model="experiments.trajectoryestimate"').proj( + probe_trajectory_uuid='uuid') + + def make(self, key): + key_traj = key.copy() + key['uuid'] = key_traj['probe_trajectory_uuid'] + + probe_insertion_uuid = grf(key, 'probe_insertion') + probe_insertion_key = dict(probe_insertion_uuid=probe_insertion_uuid) + + try: + subject_uuid, session_start_time, probe_idx = \ + (ephys_real.ProbeInsertion & probe_insertion_key).fetch1( + 'subject_uuid', 'session_start_time', 'probe_idx') + except DataJointError: + subject_uuid, session_start_time, probe_idx = \ + (ephys.ProbeInsertion & probe_insertion_key).fetch1( + 'subject_uuid', 'session_start_time', 'probe_idx') + + key_traj.update( + x=grf(key, 'x'), + y=grf(key, 'y'), + z=grf(key, 'z'), + depth=grf(key, 'depth'), + theta=grf(key, 'theta'), + phi=grf(key, 'phi'), + subject_uuid=subject_uuid, + session_start_time=session_start_time, + probe_idx=probe_idx, + provenance=grf(key, 'provenance'), + trajectory_ts=grf(key, 'datetime') + ) + + roll = grf(key, 'roll') + if roll != 'None': + key_traj.update(roll=roll) + + coord_uuid = grf(key, 'coordinate_system') + if coord_uuid != 'None': + key['coordinate_system_uuid'] = \ + (reference.CoordinateSystem & + {'coordinate_system_uuid': coord_uuid}).fetch1( + 'coordinate_system_name') + + self.insert1(key_traj) + + +@schema +class ChannelBrainLocationTemp(dj.Imported): + definition = """ + (channel_brain_location_uuid) -> alyxraw.AlyxRaw + --- + subject_uuid : uuid + session_start_time : datetime + probe_idx : tinyint + channel_axial : decimal(6, 1) + channel_lateral : decimal(6, 1) + channel_x : decimal(6, 1) + channel_y : decimal(6, 1) + channel_z : decimal(6, 1) + provenance : tinyint unsigned + ontology : varchar(32) + acronym : varchar(32) + """ + key_source = (alyxraw.AlyxRaw & 
'model="experiments.channel"').proj( + channel_brain_location_uuid='uuid') + + @classmethod + def create_entry(cls, key): + key_brain_loc = key.copy() + key['uuid'] = key_brain_loc['channel_brain_location_uuid'] + + probe_trajectory_uuid = grf(key, 'trajectory_estimate') + try: + subject_uuid, session_start_time, probe_idx, provenance = \ + (ProbeTrajectoryTemp & dict( + probe_trajectory_uuid=probe_trajectory_uuid)).fetch1( + 'subject_uuid', 'session_start_time', 'probe_idx', + 'provenance') + except Exception: + print('Non exisiting trajectory: {}'.format(probe_trajectory_uuid)) + return None + + brain_region_pk = grf(key, 'brain_region') + ontology, acronym = (reference.BrainRegion & + dict(brain_region_pk=brain_region_pk)).fetch1( + 'ontology', 'acronym') + + key_brain_loc.update( + channel_x=grf(key, 'x'), + channel_y=grf(key, 'y'), + channel_z=grf(key, 'z'), + channel_axial=grf(key, 'axial'), + channel_lateral=grf(key, 'lateral'), + subject_uuid=subject_uuid, + session_start_time=session_start_time, + probe_idx=probe_idx, + provenance=provenance, + ontology=ontology, + acronym=acronym + ) + return key_brain_loc + + def make(self, key): + + entry = ChannelBrainLocationTemp.create_entry(key) + if entry: + self.insert1(entry) diff --git a/ibl_pipeline/ingest/ingest_alyx_raw.py b/ibl_pipeline/ingest/ingest_alyx_raw.py deleted file mode 100755 index 2701092e..00000000 --- a/ibl_pipeline/ingest/ingest_alyx_raw.py +++ /dev/null @@ -1,144 +0,0 @@ -''' -This script load the json dump and insert the tuples into the alyxraw table. -''' - -import datajoint as dj -import json -import logging -import math -import collections -import os.path as path -from ibl_pipeline.ingest import alyxraw, InsertBuffer -import sys -import uuid -import re -from tqdm import tqdm -import numpy as np - - -logger = logging.getLogger(__name__) - - -def get_alyx_entries(filename=None, models=None, - exclude=None): - - exclude_list = {'auth.group', 'sessions.session', - 'authtoken.token', - 'experiments.brainregion'} - if exclude: - exclude_list = exclude_list.union(set(exclude)) - - if not filename: - filename = path.join('/', 'data', 'alyxfull.json') - - with open(filename, 'r') as fid: - keys_all = json.load(fid) - - if not models: - return [key for key in keys_all if key['model'] not in exclude_list] - elif isinstance(models, str): - return [key for key in keys_all if key['model'] == models] - - elif isinstance(models, list): - return [key for key in keys_all if key['model'] in models] - else: - raise ValueError('models should be a str, list or numpy array') - - -def insert_to_alyxraw(keys): - - # use insert buffer to speed up the insertion process - ib_main = InsertBuffer(alyxraw.AlyxRaw) - ib_part = InsertBuffer(alyxraw.AlyxRaw.Field) - - # insert into AlyxRaw table - for key in tqdm(keys, position=0): - try: - pk = uuid.UUID(key['pk']) - except Exception: - print('Error for key: {}'.format(key)) - continue - - ib_main.insert1(dict(uuid=pk, model=key['model'])) - if ib_main.flush(skip_duplicates=True, chunksz=10000): - logger.debug('Inserted 10000 raw tuples.') - # print('Inserted 10000 raw tuples.') - - if ib_main.flush(skip_duplicates=True): - logger.debug('Inserted remaining raw tuples') - # print('Inserted remaining raw tuples') - - # insert into the part table AlyxRaw.Field - for ikey, key in tqdm(enumerate(keys), position=0): - try: - try: - pk = uuid.UUID(key['pk']) - except ValueError: - print('Error for key: {}'.format(key)) - continue - - key_field = dict(uuid=uuid.UUID(key['pk'])) - for field_name, 
field_value in key['fields'].items(): - key_field = dict(key_field, fname=field_name) - - if field_name == 'json' and field_value is not None: - - key_field['value_idx'] = 0 - key_field['fvalue'] = json.dumps(field_value) - if len(key_field['fvalue']) < 10000: - ib_part.insert1(key_field) - else: - continue - if field_name == 'narrative' and field_value is not None: - # filter out emoji - emoji_pattern = re.compile( - "[" - u"\U0001F600-\U0001F64F" # emoticons - u"\U0001F300-\U0001F5FF" # symbols & pictographs - u"\U0001F680-\U0001F6FF" # transport & map symbols - u"\U0001F1E0-\U0001F1FF" # flags (iOS) - u"\U00002702-\U000027B0" - u"\U000024C2-\U0001F251" - "]+", flags=re.UNICODE) - - key_field['value_idx'] = 0 - key_field['fvalue'] = emoji_pattern.sub(r'', field_value) - - elif field_value is None or field_value == '' or field_value == [] or \ - (isinstance(field_value, float) and math.isnan(field_value)): - key_field['value_idx'] = 0 - key_field['fvalue'] = 'None' - ib_part.insert1(key_field) - - elif type(field_value) is list and \ - (type(field_value[0]) is dict or type(field_value[0]) is str): - for value_idx, value in enumerate(field_value): - key_field['value_idx'] = value_idx - key_field['fvalue'] = str(value) - ib_part.insert1(key_field) - else: - key_field['value_idx'] = 0 - key_field['fvalue'] = str(field_value) - ib_part.insert1(key_field) - - if ib_part.flush(skip_duplicates=True, chunksz=10000): - logger.debug('Inserted 10000 raw field tuples') - # print('Inserted 10000 raw field tuples') - except Exception as e: - print('Problematic entry:{}'.format(ikey)) - raise - - if ib_part.flush(skip_duplicates=True): - logger.debug('Inserted all remaining raw field tuples') - # print('Inserted all remaining raw field tuples') - - -if __name__ == '__main__': - - if len(sys.argv) < 2: # no arguments given - # if no argument given, assume a canonical file location and name - filename = path.join('/', 'data', 'alyxfull.json') - else: - filename = path.join(dir_name, sys.argv[1]) - - insert_to_alyxraw(get_alyx_entries(filename)) diff --git a/ibl_pipeline/ingest/ingest_utils.py b/ibl_pipeline/ingest/ingest_utils.py old mode 100644 new mode 100755 index 1cb90aa1..464946ad --- a/ibl_pipeline/ingest/ingest_utils.py +++ b/ibl_pipeline/ingest/ingest_utils.py @@ -2,6 +2,9 @@ Utility functions for ingestion ''' import traceback +import json +import gzip +import time def copy_table(target_schema, src_schema, table_name, fresh=False, **kwargs): @@ -14,7 +17,7 @@ def copy_table(target_schema, src_schema, table_name, fresh=False, **kwargs): try: target_table.insert(src_table - target_table.proj(), skip_duplicates=True, **kwargs) - except Exception as e: + except Exception: for t in (src_table - target_table.proj()).fetch(as_dict=True): try: if table_name == 'DataSet' and \ diff --git a/ibl_pipeline/ingest/job.py b/ibl_pipeline/ingest/job.py new file mode 100755 index 00000000..64f49d8a --- /dev/null +++ b/ibl_pipeline/ingest/job.py @@ -0,0 +1,64 @@ + +import datajoint as dj + + +schema = dj.schema(dj.config.get('database.prefix', '') + + 'ibl_ingest_job') + + +@schema +class TimeZone(dj.Lookup): + definition = """ + timezone: varchar(16) + """ + contents = zip(['European', 'EST', 'PST', 'other']) + + +@schema +class Job(dj.Manual): + definition = """ + job_date : date + -> TimeZone.proj(job_timezone='timezone') + --- + alyx_current_timestamp : datetime + alyx_previous_timestamp : datetime + created_pks : longblob + modified_pks : longblob + deleted_pks : longblob + modified_pks_important=null : 
longblob + session_prefiltered=0: bool + job_ts=CURRENT_TIMESTAMP : timestamp + """ + + +@schema +class Task(dj.Lookup): + definition = """ + task : varchar(64) + --- + task_order : tinyint + task_description='' : varchar(1024) + """ + contents = [ + ['Delete alyxraw', 1, 'Delete alyxraw and shadow table entries for updated and deleted records'], + ['Delete shadow membership', 2, 'Delete shadow membership records for updated and deleted records'], + ['Ingest alyxraw', 3, 'Ingest to alyxraw'], + ['Ingest shadow', 4, 'Ingest to alyx shadow tables'], + ['Ingest shadow membership', 5, 'Ingest to alyx shadow membership tables'], + ['Ingest real', 6, 'Ingest to alyx real tables'], + ['Update fields', 7, 'Update fields in real tables'], + ['Populate behavior', 8, 'Populate behavior tables'] + ] + + +@schema +class TaskStatus(dj.Manual): + definition = """ + -> Job + -> Task + --- + task_start_time : datetime + task_end_time : datetime + task_duration : float # in mins + task_status_comments='' : varchar(1000) + """ diff --git a/ibl_pipeline/ingest/qc.py b/ibl_pipeline/ingest/qc.py new file mode 100644 index 00000000..36c5f99f --- /dev/null +++ b/ibl_pipeline/ingest/qc.py @@ -0,0 +1,114 @@ +import datajoint as dj +import json + +from . import alyxraw, reference, subject, action, acquisition +from .. import acquisition as acquisition_real +from .. import qc +from . import get_raw_field as grf + +schema = dj.schema(dj.config.get('database.prefix', '') + + 'ibl_ingest_qc') + + +@schema +class SessionQC(dj.Manual): + definition = """ + subject_uuid : uuid + session_start_time : datetime + --- + qc : tinyint unsigned + sessionqc_ts=CURRENT_TIMESTAMP: timestamp + """ + + +@schema +class SessionExtendedQC(dj.Manual): + definition = """ + subject_uuid : uuid + session_start_time : datetime + qc_type : varchar(16) + --- + extended_qc : tinyint unsigned + session_extended_qc_ts=CURRENT_TIMESTAMP: timestamp + """ + + class Field(dj.Part): + definition = """ + -> master + qc_fname : varchar(32) + --- + qc_fvalue_float=null : float + qc_fvalue_str=null : varchar(32) + qc_fvalue_blob=null : blob + """ + + +qc_types = qc.QCType.fetch('qc_type') +qc_choices = qc.QCChoice.fetch(format='frame') + + +@schema +class SessionQCIngest(dj.Computed): + definition = """ + -> alyxraw.AlyxRaw.proj(session_uuid='uuid') + """ + key_source = dj.U('session_uuid') & \ + (alyxraw.AlyxRaw.Field & + (alyxraw.AlyxRaw & 'model="actions.session"') & + 'fname="qc"' & + 'fvalue in ("10", "30", "40", "50")').proj(session_uuid='uuid') + + def make(self, key): + + self.insert1(key) + + key['uuid'] = key['session_uuid'] + qc = grf(key, 'qc') + qc_extended = grf(key, 'extended_qc') + + try: + qc_extended = json.loads(qc_extended) + except json.decoder.JSONDecodeError: + qc_extended = qc_extended.replace("\'", "\"") + qc_extended = qc_extended.replace('None', "\"None\"") + qc_extended = json.loads(qc_extended) + + if len(acquisition.Session & key) == 1: + session_key = (acquisition_real.Session & key).fetch1('KEY') + else: + session_key = (acquisition.Session & key).fetch1('KEY') + + SessionQC.insert1( + dict(**session_key, qc=int(qc)) + ) + + for qc_type in qc_types: + try: + session_qc_type = qc_extended[qc_type] + qc_choice = qc_choices[ + qc_choices['qc_label'] == session_qc_type].index[0] + SessionExtendedQC.insert1( + dict(**session_key, + qc_type=qc_type, + extended_qc=qc_choice) + ) + for k, v in qc_extended.items(): + if f'_{qc_type}' in k: + qc_field = dict( + **session_key, + qc_type=qc_type, + qc_fname=k) + if type(v) == float: + 
qc_fvalue_name = 'qc_fvalue_float' + elif v == "None": + pass + elif type(v) == str: + qc_fvalue_name = 'qc_fvalue_varchar' + else: + qc_fvalue_name = 'qc_fvalue_blob' + + SessionExtendedQC.Field.insert1( + {**qc_field, + qc_fvalue_name: v}) + except Exception: + pass diff --git a/ibl_pipeline/ingest/reference.py b/ibl_pipeline/ingest/reference.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/ingest/subject.py b/ibl_pipeline/ingest/subject.py old mode 100644 new mode 100755 index d10068fb..c7f2c2f0 --- a/ibl_pipeline/ingest/subject.py +++ b/ibl_pipeline/ingest/subject.py @@ -240,6 +240,12 @@ def make(self, key): if sex != 'None': key_subject['sex'] = sex + strain_uuid = grf(key, 'strain') + if strain_uuid != 'None': + key_subject['subject_strain'] = \ + (Strain & dict(strain_uuid=uuid.UUID(strain_uuid))).fetch1( + 'strain_name') + birth_date = grf(key, 'birth_date') if birth_date != 'None': key_subject['subject_birth_date'] = birth_date @@ -509,7 +515,12 @@ def make(self, key): key_cage['cage_name'] = grf(key, 'cage') json_content = grf(key, 'json') if json_content != 'None': - json_dict = json.loads(json_content) + try: + json_dict = json.loads(json_content) + except json.decoder.JSONDecodeError: + json_content = json_content.replace("\'", "\"") + json_dict = json.loads(json_content) + history = json_dict['history'] if 'cage' not in history: self.insert1(key_cage, skip_duplicates=True) diff --git a/ibl_pipeline/job/patch.py b/ibl_pipeline/patch/patch.py similarity index 95% rename from ibl_pipeline/job/patch.py rename to ibl_pipeline/patch/patch.py index 103a6fe2..a4806c46 100755 --- a/ibl_pipeline/job/patch.py +++ b/ibl_pipeline/patch/patch.py @@ -29,8 +29,10 @@ 'ephys.GoodCluster', 'ephys.AlignedTrialSpikes', 'histology.ClusterBrainRegion', + 'ephys.ChannelGroup', 'ephys.DefaultCluster.Ks2Label', 'ephys.DefaultCluster.Metrics', + 'ephys.DefaultCluster.Metric', 'ephys.DefaultCluster', 'ephys.CompleteClusterSession', 'behavior_plotting.SessionPsychCurve', @@ -159,7 +161,7 @@ class Run(dj.Manual): job_status='' : enum('Success', 'Partial Success', 'Error', '') """ - def _delete_table(self, t, key, table_type='session'): + def _delete_table(self, t, key, table_type='session', save_status=True): key_del = key.copy() if table_type == 'virtual': @@ -176,8 +178,9 @@ def _delete_table(self, t, key, table_type='session'): else: original = False - RunStatus.TableStatus.insert1( - dict(**key_table, original=original), skip_duplicates=True) + if save_status: + RunStatus.TableStatus.insert1( + dict(**key_table, original=original), skip_duplicates=True) print('Deleting table {} ...'.format(t['full_table_name'])) if t['full_table_name'] == '`ibl_ephys`.`__aligned_trial_spikes`': @@ -186,12 +189,14 @@ def _delete_table(self, t, key, table_type='session'): (table_class & cluster).delete_quick() else: (table_class & key_del).delete_quick() - dj.Table._update( - RunStatus.TableStatus & key_table, - 'status', 'Deleted') - dj.Table._update( - RunStatus.TableStatus & key_table, - 'delete_time', datetime.datetime.now()) + + if save_status: + dj.Table._update( + RunStatus.TableStatus & key_table, + 'status', 'Deleted') + dj.Table._update( + RunStatus.TableStatus & key_table, + 'delete_time', datetime.datetime.now()) def make(self, key): @@ -285,7 +290,7 @@ def make(self, key): if self & key: dj.Table._update(self & key, 'job_status', job_status) else: - self.insert1(key) + self.insert1(dict(**key, job_status=job_status)) def populate(self, *restrictions, level='New', display_progress=False): diff 
--git a/ibl_pipeline/plotting/__init__.py b/ibl_pipeline/plotting/__init__.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/plotting/behavior.py b/ibl_pipeline/plotting/behavior.py old mode 100644 new mode 100755 index 7f023807..5ae2de51 --- a/ibl_pipeline/plotting/behavior.py +++ b/ibl_pipeline/plotting/behavior.py @@ -969,7 +969,7 @@ def make(self, key): last_sessions = last_sessions * acquisition.Session * \ behavior.SessionTrainingStatus - filerecord = data.FileRecord & subjects & 'relative_path LIKE "%alf%"' + filerecord = data.FileRecord & subjects.fetch('KEY') & 'relative_path LIKE "%alf%"' last_filerecord = subjects.aggr( filerecord, latest_session_on_flatiron='max(session_start_time)') diff --git a/ibl_pipeline/plotting/ephys.py b/ibl_pipeline/plotting/ephys.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/plotting/ephys_plotting.py b/ibl_pipeline/plotting/ephys_plotting.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/plotting/figure_model.py b/ibl_pipeline/plotting/figure_model.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/plotting/plotting_utils_behavior.py b/ibl_pipeline/plotting/plotting_utils_behavior.py old mode 100644 new mode 100755 index 16e2bf91..abc20304 --- a/ibl_pipeline/plotting/plotting_utils_behavior.py +++ b/ibl_pipeline/plotting/plotting_utils_behavior.py @@ -1,6 +1,10 @@ from ibl_pipeline.analyses import behavior from ibl_pipeline import behavior as behavior_ingest -from ibl_pipeline import subject, action, acquisition, ephys +from ibl_pipeline import subject, action, acquisition +from os import environ +mode = environ.get('MODE') +if mode != 'public': + from ibl_pipeline import ephys from ibl_pipeline.utils import psychofit as psy from uuid import UUID import numpy as np @@ -151,9 +155,12 @@ def get_status(subj): first_ready4recording = subj.aggr( behavior.SessionTrainingStatus & 'training_status = "ready4recording"', first_session='DATE(min(session_start_time))') - first_ephys_session = subj.aggr( - behavior.SessionTrainingStatus & ephys.ProbeInsertion, - first_session='DATE(min(session_start_time))') + if mode != 'public': + first_ephys_session = subj.aggr( + behavior.SessionTrainingStatus & ephys.ProbeInsertion, + first_session='DATE(min(session_start_time))') + else: + first_ephys_session = [] result = dict() if len(first_trained_1a): diff --git a/ibl_pipeline/plotting/plotting_utils_ephys.py b/ibl_pipeline/plotting/plotting_utils_ephys.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/plotting/utils.py b/ibl_pipeline/plotting/utils.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/process/__init__.py b/ibl_pipeline/process/__init__.py new file mode 100755 index 00000000..ecb4b2aa --- /dev/null +++ b/ibl_pipeline/process/__init__.py @@ -0,0 +1,37 @@ +from ibl_pipeline.ingest import alyxraw +from ibl_pipeline.utils import is_valid_uuid +import datetime + + +def get_timezone(t=datetime.datetime.now().time()): + if t < datetime.time(8, 30): + timezone = 'European' + elif t > datetime.time(8, 30) and t < datetime.time(10, 30): + timezone = 'EST' + elif t > datetime.time(10, 30) and t < datetime.time(14, 30): + timezone = 'PST' + else: + timezone = 'other' + return timezone + + +def get_important_pks(pks, return_original_dict=False): + ''' + Filter out modified keys that belongs to data.filerecord and jobs.task + :params modified_keys: list of pks + :params optional return original_dict: boolean, if True, return the list of dictionaries with uuids to be the key + :returns pks_important: 
list of filtered pks + :returns pks_dict: list of dictionary with uuid as the key + ''' + + pks = [pk for pk in pks if is_valid_uuid(pk)] + pks_dict = [{'uuid': pk} for pk in pks] + pks_unimportant = [ + str(pk['uuid']) + for pk in (alyxraw.AlyxRaw & 'model in ("data.filerecord", "jobs.task")' & pks_dict).fetch('KEY')] + pks_important = list(set(pks) - set(pks_unimportant)) + + if return_original_dict: + return pks_important, pks_dict + else: + return pks_important diff --git a/ibl_pipeline/process/autoprocess.py b/ibl_pipeline/process/autoprocess.py new file mode 100755 index 00000000..a2219f93 --- /dev/null +++ b/ibl_pipeline/process/autoprocess.py @@ -0,0 +1,203 @@ + +from ibl_pipeline.process import ( + create_ingest_task, + delete_update_entries, + ingest_alyx_raw, + ingest_membership, + ingest_shadow, + ingest_real, + populate_behavior, + get_timezone, + process_histology +) +from ibl_pipeline.ingest import job +from os import path +import datetime +import time +from tqdm import tqdm + + +def ingest_status(job_key, task, start, end): + + job.TaskStatus.insert1( + dict( + **job_key, + task=task, + task_start_time=start, + task_end_time=end, + task_duration=(end-start).total_seconds()/60., + ), + skip_duplicates=True + ) + + +def process_new(previous_dump=None, latest_dump=None, + job_date=datetime.date.today().strftime('%Y-%m-%d'), + timezone='other'): + + job_key = dict( + job_date=job_date, + job_timezone=timezone, + ) + + if previous_dump is None: + previous_dump = path.join('/', 'data', 'alyxfull.json.last') + + if latest_dump is None: + latest_dump = path.join('/', 'data', 'alyxfull.json') + + print('Comparing json dumps ...') + create_ingest_task.compare_json_dumps(previous_dump, latest_dump) + + created_pks, modified_pks, deleted_pks, modified_pks_important = ( + job.Job & job_key).fetch1( + 'created_pks', 'modified_pks', 'deleted_pks', 'modified_pks_important') + + print('Deleting modified entries from alyxraw and shadow tables...') + start = datetime.datetime.now() + + delete_update_entries.delete_entries_from_alyxraw( + modified_pks, modified_pks_important) + + ingest_status(job_key, 'Delete alyxraw', start, end=datetime.datetime.now()) + + print('Deleting modified entries from membership tables...') + start = datetime.datetime.now() + delete_update_entries.delete_entries_from_membership( + modified_pks_important) + ingest_status(job_key, 'Delete shadow membership', start, + end=datetime.datetime.now()) + + print('Ingesting into alyxraw...') + start = datetime.datetime.now() + ingest_alyx_raw.insert_to_alyxraw( + ingest_alyx_raw.get_alyx_entries( + latest_dump, new_pks=created_pks+modified_pks)) + ingest_status(job_key, 'Ingest alyxraw', start, end=datetime.datetime.now()) + + print('Ingesting into shadow tables...') + start = datetime.datetime.now() + ingest_shadow.main(modified_pks=modified_pks_important) + ingest_status(job_key, 'Ingest shadow', start, end=datetime.datetime.now()) + + print('Ingesting into shadow membership tables...') + start = datetime.datetime.now() + ingest_membership.main(created_pks+modified_pks_important) + ingest_status(job_key, 'Ingest shadow membership', start, + end=datetime.datetime.now()) + + print('Ingesting alyx real...') + start = datetime.datetime.now() + ingest_real.main() + ingest_status(job_key, 'Ingest real', start, end=datetime.datetime.now()) + + print('Updating fields...') + start = datetime.datetime.now() + delete_update_entries.update_entries_from_real_tables( + modified_pks_important) + ingest_status(job_key, 'Update fields', 
start, end=datetime.datetime.now()) + + print('Ingesting behavior...') + start = datetime.datetime.now() + populate_behavior.main(backtrack_days=30) + ingest_status(job_key, 'Populate behavior', start, + end=datetime.datetime.now()) + + +def process_public(): + + from ibl_pipeline import public + from ibl_pipeline.common import subject, acquisition + + ingest_alyx_raw.insert_to_alyxraw( + ingest_alyx_raw.get_alyx_entries()) + + excluded_tables = [ + 'Weighing', + 'WaterType', + 'WaterAdministration', + 'WaterRestriction', + 'ProbeModel', + 'ProbeInsertion', + 'ProbeTrajectory' + ] + + ingest_shadow.main(excluded_tables=excluded_tables) + + excluded_membership_tables = [ + 'WaterRestrictionUser', + 'WaterRestrictionProcedure', + 'SurgeryUser', + 'WaterAdministrationSession', + ] + + ingest_membership.main( + excluded_tables=excluded_membership_tables) + + ingest_real.main( + excluded_tables=excluded_tables+excluded_membership_tables, + public=True) + + # delete non-releasing tables + from ibl_pipeline.ingest import InsertBuffer + + table = InsertBuffer(acquisition.Session) + for key in tqdm( + (acquisition.Session - public.PublicSession - behavior.TrialSet).fetch('KEY')): + table.delete1(key) + if table.flush_delete(chunksz=100): + print('Deleted 100 sessions') + + table.flush_delete() + print('Deleted the rest of the sessions') + + subjs = subject.Subject & acquisition.Session + + for key in tqdm( + (subject.Subject - public.PublicSubjectUuid - subjs.proj()).fetch('KEY')): + (subject.Subject & key).delete() + + excluded_behavior_tables = [ + 'AmbientSensorData', + 'Settings', + 'SessionDelay' + ] + + populate_behavior.main(excluded_tables=excluded_behavior_tables) + + +def process_updates(pks, current_dump='/data/alyxfull.json'): + ''' + Update the all the fields in givens a set of pks + :param pks: uuids where an update is needed + :param current_dump: the latest + ''' + print('Deleting from alyxraw...') + delete_update_entries.delete_entries_from_alyxraw( + modified_pks_important=pks) + print('Deleting from shadow membership...') + delete_update_entries.delete_entries_from_membership(pks) + + print('Ingesting alyxraw...') + ingest_alyx_raw.insert_to_alyxraw( + ingest_alyx_raw.get_alyx_entries( + current_dump, new_pks=pks)) + + print('Ingesting into shadow tables...') + ingest_shadow.main(excluded_tables=['DataSet', 'FileRecord']) + + print('Ingesting into shadow membership tables...') + ingest_membership.main(pks) + + print('Ingesting alyx real...') + ingest_real.main(excluded_tables=['DataSet', 'FileRecord']) + + print('Updating field...') + delete_update_entries.update_entries_from_real_tables(pks) + + +if __name__ == '__main__': + + process_new(previous_dump='/data/alyxfull.json.last', + latest_dump='/data/alyxfull_20201128_0400.json', + job_date='2020-11-28', timezone='European') diff --git a/ibl_pipeline/process/create_ingest_task.py b/ibl_pipeline/process/create_ingest_task.py new file mode 100644 index 00000000..8656859b --- /dev/null +++ b/ibl_pipeline/process/create_ingest_task.py @@ -0,0 +1,135 @@ +import os, gc, json, datetime +from ibl_pipeline.ingest import job +from ibl_pipeline.process import get_important_pks, get_timezone +from ibl_pipeline.utils import is_valid_uuid + + +SESSION_FIELDS = [ + 'location', 'subject', 'lab', 'start_time', + 'end_time', 'parent_session', 'project', 'type', + 'task_protocol', 'users', 'procedures'] + + +def get_modified_pks(data0, data1): + d0 = {_['pk']: json.dumps(_['fields'], sort_keys=True) for _ in data0} + d1 = {_['pk']: 
json.dumps(_['fields'], sort_keys=True) for _ in data1} + d0 = {k: v for k, v in d0.items() if k in d1.keys()} + d1 = {k: v for k, v in d1.items() if k in d0.keys()} + + return [k for k in d0.keys() if d0[k] != d1[k] and is_valid_uuid(k)] + + +def get_created_deleted_pks(data0, data1): + + old_pks = {_['pk'] for _ in data0} + new_pks = {_['pk'] for _ in data1} + + return [pk for pk in sorted(new_pks - old_pks) if is_valid_uuid(pk)], \ + [pk for pk in sorted(old_pks - new_pks) if is_valid_uuid(pk)] + + +def filter_modified_keys_session(data0, data1, modified_pks): + + sessions0 = {_['pk']: json.dumps({key: _['fields'][key] for key in SESSION_FIELDS}, sort_keys=True) + for _ in data0 if _['model'] == 'actions.session'} + sessions1 = {_['pk']: json.dumps({key: _['fields'][key] for key in SESSION_FIELDS}, sort_keys=True) + for _ in data1 if _['model'] == 'actions.session'} + sessions_same = dict(sessions0.items() & sessions1.items()).keys() + return list(set(modified_pks) - set(sessions_same)) + + +def compare_json_dumps(previous_dump='/data/alyxfull.json', + latest_dump='/data/alyxfull.json.last', + create_files=True, insert_to_table=True, + filter_pks_for_unused_models=True, + filter_pks_for_unused_session_fields=True): + + """Compare two json dumps from alyx and created files with the added, deleted, modified fields. + + Args: + previous_dump (json filepath, optional): filepath of alyx json dump of the last ingestion Defaults to /data/alyxfull.json. + latest_dump (json filepath, optional): filepath of alyx json dump of the current ingestion. Defaults to '/data/alyxfull.json.last' + create_files (bool, optional): whether to create files saving the created, deleted, modified keys. Defaults to True. + insert_to_table (bool, optional): whether to insert the result to DataJoint job table. Defaults to True. + filter_pks_for_unused_models (bool, optional): filter modified pks in models of interest. Defaults to True. + filter_pks_for_unused_session_fields (bool, optional): only keep the modified keys when there is a change in fields of interest. Defaults to True. 
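+
+    Example:
+        A minimal sketch of a manual run (the dump paths are assumptions;
+        point them at your own alyx dumps). This mirrors the call in
+        __main__ but with the locations spelled out:
+
+            compare_json_dumps(
+                previous_dump='/data/alyxfull.json.last',
+                latest_dump='/data/alyxfull.json',
+                insert_to_table=False)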
+ + """ + + print("Loading first JSON dump...") + with open(previous_dump, 'r') as f: + data0 = json.load(f) + print("Loading second JSON dump...") + with open(latest_dump, 'r') as f: + data1 = json.load(f) + print("Finished loading JSON dumps.") + + print("Computing differences...") + modified_pks = get_modified_pks(data0, data1) + + print("Finished creating modified keys.") + print("Computing created and deleted_keys...") + + created_pks, deleted_pks = get_created_deleted_pks(data0, data1) + + print("Finished creating created_pks and deleted_pks.") + + if filter_pks_for_unused_session_fields: + print('Filtering modified sessions that does not have a change in fields of interest...') + modified_pks = filter_modified_keys_session(data0, data1, modified_pks) + + if filter_pks_for_unused_models: + print('Remove modified entries in tables data.filerecord and jobs.task') + modified_pks_important = get_important_pks(modified_pks) + + # figure out job date and timezone + latest_modified_time = datetime.datetime.fromtimestamp( + os.path.getmtime(latest_dump)) + d = latest_modified_time.date() + t = latest_modified_time.time() + previous_modified_time = datetime.datetime.fromtimestamp( + os.path.getmtime(previous_dump)) + + timezone = get_timezone(t) + + if create_files: + suffix = f'_{latest_modified_time.strftime("%Y-%m-%d")}_{timezone}' + root_dir = '/data/daily_increments/' + print(f"New objects: {len(created_pks)}") + with open(f"{root_dir}created_pks_{suffix}.json", "w") as f: + json.dump(created_pks, f) + print(f"Deleted objects: {len(deleted_pks)}") + with open(f"{root_dir}deleted_pks_{suffix}.json", "w") as f: + json.dump(deleted_pks, f) + print(f"Modified objects: {len(modified_pks)}") + with open(f"{root_dir}modified_pks_{suffix}.json", "w") as f: + json.dump(modified_pks, f) + print(f"Important modified objects: {len(modified_pks_important)}") + + if filter_pks_for_unused_models: + with open(f"{root_dir}modified_pks_important{suffix}.json", "w") as f: + json.dump(modified_pks_important, f) + + if insert_to_table: + entry = dict( + job_date=d, + job_timezone=timezone, + alyx_current_timestamp=latest_modified_time, + alyx_previous_timestamp=previous_modified_time, + created_pks=created_pks, + modified_pks=modified_pks, + deleted_pks=deleted_pks, + session_prefiltered=filter_pks_for_unused_session_fields + ) + if not filter_pks_for_unused_models: + modified_pks_important = None + + job.Job.insert1( + dict(**entry, + modified_pks_important=modified_pks_important), + skip_duplicates=True) + + +if __name__ == '__main__': + + compare_json_dumps(insert_to_table=False) diff --git a/ibl_pipeline/process/delete_update_entries.py b/ibl_pipeline/process/delete_update_entries.py new file mode 100755 index 00000000..20b0dbd3 --- /dev/null +++ b/ibl_pipeline/process/delete_update_entries.py @@ -0,0 +1,219 @@ +''' +This module delete the entries from alyxraw, shadow membership_tables and update real membership_tables +''' +import datajoint as dj +from ibl_pipeline.process.ingest_membership import membership_tables +from ibl_pipeline.common import * +from ibl_pipeline.ingest.common import * +from ibl_pipeline.ingest import job, InsertBuffer +from ibl_pipeline.ingest import ingest_utils +from ibl_pipeline import update +from uuid import UUID +from tqdm import tqdm +import pdb +from ibl_pipeline.utils import is_valid_uuid +from ibl_pipeline.process import get_important_pks +import datetime + + +# ====================================== functions for deletion ================================== + +def 
delete_entries_from_alyxraw(pks_to_be_deleted=[], modified_pks_important=[]): + + ''' + Delete entries from alyxraw and shadow membership_tables, excluding the membership table. + ''' + + print('Deleting alyxraw entries corresponding to file records...') + + if pks_to_be_deleted: + if len(pks_to_be_deleted) > 5000: + file_record_fields = alyxraw.AlyxRaw.Field & \ + 'fname = "exists"' & 'fvalue = "false"' + else: + file_record_fields = alyxraw.AlyxRaw.Field & \ + 'fname = "exists"' & 'fvalue = "false"' & \ + [{'uuid': pk} for pk in pks_to_be_deleted] + + for key in tqdm(file_record_fields): + (alyxraw.AlyxRaw.Field & key).delete_quick() + + if modified_pks_important: + pk_list = [{'uuid': pk} for pk in modified_pks_important + if is_valid_uuid(pk)] + (alyxraw.AlyxRaw & 'model != "actions.session"' & + pk_list).delete() + (alyxraw.AlyxRaw.Field & pk_list & 'fname!="start_time"' & + (alyxraw.AlyxRaw & 'model="actions.session"')).delete_quick() + + +def delete_entries_from_membership(pks_to_be_deleted): + ''' + Delete entries from shadow membership membership_tables + ''' + for t in membership_tables: + ingest_mod = t['dj_parent_table'].__module__ + table_name = t['dj_parent_table'].__name__ + + mem_table_name = t['dj_current_table'].__name__ + + print(f'Deleting from table {mem_table_name} ...') + real_table = eval(ingest_mod.replace('ibl_pipeline.ingest.', '') + '.' + table_name) + + (t['dj_current_table'] & + (real_table & + [{t['dj_parent_uuid_name']:pk} + for pk in pks_to_be_deleted if is_valid_uuid(pk)]).fetch('KEY')).delete() + + +# =================================== functions for update ========================================== + +TABLES_TO_UPDATE = [ + {'real_schema': reference, + 'shadow_schema': reference_ingest, + 'table_name': 'Project', + 'members': [] + }, + {'real_schema': subject, + 'shadow_schema': subject_ingest, + 'table_name': 'Subject', + 'members': ['SubjectLab', 'SubjectUser', 'SubjectProject', 'Death'] + }, + {'real_schema': action, + 'shadow_schema': action_ingest, + 'table_name': 'Weighing', + 'members': [] + }, + {'real_schema': action, + 'shadow_schema': action_ingest, + 'table_name': 'WaterRestriction', + 'members': [] + }, + {'real_schema': action, + 'shadow_schema': action_ingest, + 'table_name': 'WaterAdministration', + 'members': [] + }, + {'real_schema': acquisition, + 'shadow_schema': acquisition_ingest, + 'table_name': 'Session', + 'members': ['SessionUser', 'SessionProject'] + } +] + + +def update_fields(real_schema, shadow_schema, table_name, pks, insert_to_table=False): + ''' + Given a table and the primary key of real table, update all the fields that have discrepancy. + Inputs: real_schema : real schema module, e.g. reference + shadow_schema : shadow schema module, e.g. reference_ingest + table_name : string, name of a table, e.g. Subject + pks : list of dictionaries, primary keys of real table that contains modification + insert_to_table : boolean, if True, log the update histolory in the table ibl_update.UpdateRecord + ''' + + real_table = getattr(real_schema, table_name) + shadow_table = getattr(shadow_schema, table_name) + + secondary_fields = set(real_table.heading.secondary_attributes) + ts_field = [f for f in secondary_fields + if f.endswith('_ts')][0] + fields_to_update = secondary_fields - {ts_field} + + for r in (real_table & pks).fetch('KEY'): + + pk_hash = UUID(dj.hash.hash_key_values(r)) + + if not shadow_table & r: + real_record = (real_table & r).fetch1() + if insert_to_table: + update_record = dict( + table=real_table.__module__ + '.' 
+ real_table.__name__, + attribute='unknown', + pk_hash=pk_hash, + original_ts=real_record[ts_field], + update_ts=datetime.datetime.now(), + pk_dict=r, + ) + update.UpdateRecord.insert1(update_record) + update_record.pop('pk_dict') + + update_error_msg = 'Record does not exist in the shadow {}'.format(r) + update_record_error = dict( + **update_record, + update_action_ts=datetime.datetime.now(), + update_error_msg=update_error_msg + ) + update.UpdateError.insert1(update_record_error) + + print(update_error_msg) + continue + + shadow_record = (shadow_table & r).fetch1() + real_record = (real_table & r).fetch1() + + for f in fields_to_update: + if real_record[f] != shadow_record[f]: + try: + (real_table & r)._update(f, shadow_record[f]) + update_narrative = f'{table_name}.{f}: {shadow_record[f]} != {real_record[f]}' + print(update_narrative) + if insert_to_table: + update_record = dict( + table=real_table.__module__ + '.' + real_table.__name__, + attribute=f, + pk_hash=pk_hash, + original_ts=real_record[ts_field], + update_ts=shadow_record[ts_field], + pk_dict=r, + original_value=real_record[f], + updated_value=shadow_record[f], + update_narrative=update_narrative + ) + update.UpdateRecord.insert1(update_record) + + except BaseException as e: + print(f'Error while updating record {r}: {str(e)}') + + +def update_entries_from_real_tables(modified_pks): + + for table in TABLES_TO_UPDATE: + + print('Updating {}...'.format(table['table_name'])) + t = table.copy() + table = getattr(t['real_schema'], t['table_name']) + + if t['table_name'] == 'Subject': + uuid_field = 'subject_uuid' + else: + uuid_field = next(f for f in table.heading.secondary_attributes + if '_uuid' in f and 'subject' not in f) + + pks_important = get_important_pks(modified_pks) + + query = table & [{uuid_field: pk} for pk in pks_important] + + if query: + members = t.pop('members') + update_fields(**t, pks=query.fetch('KEY'), insert_to_table=True) + + if members: + for m in members: + sub_t = getattr(t['real_schema'], m) + if sub_t & query: + update_fields(t['real_schema'], t['shadow_schema'], + m, (sub_t & query).fetch('KEY'), + insert_to_table=True) + + + +if __name__ == '__main__': + + dj.config['safemode'] = False + + deleted_pks, modified_pks, modified_pks_important = \ + (job.Job & 'job_date="2020-09-04"').fetch1( + 'deleted_pks', 'modified_pks', 'modified_pks_important') + + delete_entries_from_alyxraw(deleted_pks+modified_pks, modified_pks_important) diff --git a/ibl_pipeline/process/ingest_alyx_raw.py b/ibl_pipeline/process/ingest_alyx_raw.py new file mode 100755 index 00000000..3df87ad9 --- /dev/null +++ b/ibl_pipeline/process/ingest_alyx_raw.py @@ -0,0 +1,167 @@ +''' +This script load the json dump and insert the tuples into the alyxraw table. 
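+
+A minimal usage sketch (the dump path below is an assumption; substitute the
+location of your own alyx JSON dump):
+
+    from ibl_pipeline.process import ingest_alyx_raw
+    keys = ingest_alyx_raw.get_alyx_entries(
+        filename='/data/alyxfull.json', models='actions.session')
+    ingest_alyx_raw.insert_to_alyxraw(keys, alyx_type='all')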
+''' + +import datajoint as dj +import json +import logging +import math +import collections +import os.path as path +from ibl_pipeline.ingest import alyxraw, InsertBuffer +import sys +import uuid +import re +from tqdm import tqdm +import numpy as np + + +logger = logging.getLogger(__name__) + + +def get_alyx_entries(filename=None, models=None, + exclude=None, new_pks=None): + + exclude_list = {'auth.group', 'sessions.session', + 'authtoken.token', + 'experiments.brainregion', + 'actions.notificationrule', + 'misc.note', + 'jobs.task', + 'actions.notificationrule', + 'actions.notifications' + } + if exclude: + exclude_list = exclude_list.union(set(exclude)) + + if not filename: + filename = path.join('/', 'data', 'alyxfull.json') + + with open(filename, 'r') as fid: + keys_all = json.load(fid) + + print('Creating entries to insert into alyxraw...') + if not models: + if new_pks: + return [key for key in tqdm(keys_all) if key['model'] not in exclude_list and key['pk'] in new_pks] + else: + return [key for key in keys_all if key['model'] not in exclude_list] + elif isinstance(models, str): + if new_pks: + return [key for key in keys_all if key['model'] == models and key['pk'] in new_pks] + + else: + return [key for key in keys_all if key['model'] == models] + elif isinstance(models, list): + if new_pks: + return [key for key in keys_all if key['model'] in models and key['pk'] in new_pks] + else: + return [key for key in keys_all if key['model'] in models] + else: + raise ValueError('models should be a str, list or numpy array') + + +def insert_to_alyxraw( + keys, alyxraw_module=alyxraw, + alyx_type='all'): + + # use insert buffer to speed up the insertion process + if alyx_type in ('all', 'main'): + + ib_main = InsertBuffer(alyxraw_module.AlyxRaw) + # insert into AlyxRaw table + for key in tqdm(keys, position=0): + try: + pk = uuid.UUID(key['pk']) + except Exception: + print('Error for key: {}'.format(key)) + continue + + ib_main.insert1(dict(uuid=pk, model=key['model'])) + if ib_main.flush(skip_duplicates=True, chunksz=10000): + logger.debug('Inserted 10000 raw tuples.') + + if ib_main.flush(skip_duplicates=True): + logger.debug('Inserted remaining raw tuples') + ib_main = InsertBuffer(alyxraw_module.AlyxRaw) + + if alyx_type in ('all', 'part'): + ib_part = InsertBuffer(alyxraw_module.AlyxRaw.Field) + # insert into the part table AlyxRaw.Field + for ikey, key in tqdm(enumerate(keys), position=0): + try: + try: + pk = uuid.UUID(key['pk']) + except ValueError: + print('Error for key: {}'.format(key)) + continue + + key_field = dict(uuid=uuid.UUID(key['pk'])) + for field_name, field_value in key['fields'].items(): + key_field = dict(key_field, fname=field_name) + + if field_name == 'json' and field_value is not None: + + key_field['value_idx'] = 0 + key_field['fvalue'] = json.dumps(field_value) + if len(key_field['fvalue']) < 10000: + ib_part.insert1(key_field) + else: + continue + if field_name == 'narrative' and field_value is not None: + # filter out emoji + emoji_pattern = re.compile( + "[" + u"\U0001F600-\U0001F64F" # emoticons + u"\U0001F300-\U0001F5FF" # symbols & pictographs + u"\U0001F680-\U0001F6FF" # transport & map symbols + u"\U0001F1E0-\U0001F1FF" # flags (iOS) + u"\U00002702-\U000027B0" + u"\U000024C2-\U0001F251" + "]+", flags=re.UNICODE) + + key_field['value_idx'] = 0 + key_field['fvalue'] = emoji_pattern.sub(r'', field_value) + + elif field_value is None or field_value == '' or field_value == [] or \ + (isinstance(field_value, float) and math.isnan(field_value)): + 
key_field['value_idx'] = 0 + key_field['fvalue'] = 'None' + ib_part.insert1(key_field) + + elif type(field_value) is list and \ + (type(field_value[0]) is dict or type(field_value[0]) is str): + for value_idx, value in enumerate(field_value): + key_field['value_idx'] = value_idx + key_field['fvalue'] = str(value) + ib_part.insert1(key_field) + else: + key_field['value_idx'] = 0 + key_field['fvalue'] = str(field_value) + ib_part.insert1(key_field) + + if ib_part.flush(skip_duplicates=True, chunksz=10000): + logger.debug('Inserted 10000 raw field tuples') + + except Exception: + print('Problematic entry:{}'.format(ikey)) + raise + + if ib_part.flush(skip_duplicates=True): + logger.debug('Inserted all remaining raw field tuples') + + +if __name__ == '__main__': + + if len(sys.argv) < 2: # no arguments given + # if no argument given, assume a canonical file location and name + filename = path.join('/', 'data', 'alyxfull.json') + else: + filename = path.join(dir_name, sys.argv[1]) + + new_pks_file = path.join('/', 'data', 'created_pks.json') + + with open(new_pks_file, 'r') as fid: + new_pks = json.load(fid) + + insert_to_alyxraw(get_alyx_entries(filename, new_pks=new_pks)) diff --git a/ibl_pipeline/process/ingest_membership.py b/ibl_pipeline/process/ingest_membership.py new file mode 100755 index 00000000..75e9c21b --- /dev/null +++ b/ibl_pipeline/process/ingest_membership.py @@ -0,0 +1,242 @@ +''' +This script inserts membership tuples into the membership shadow tables, \ +which cannot be inserted with auto-population. +''' + +import datajoint as dj +import json +import uuid +from ibl_pipeline.ingest import alyxraw, reference, subject, action, acquisition, data +from ibl_pipeline.ingest import get_raw_field as grf +from ibl_pipeline.utils import is_valid_uuid + + +membership_tables = [ + {'dj_current_table': reference.ProjectLabMember, + 'alyx_parent_model': 'subjects.project', + 'alyx_field': 'users', + 'dj_parent_table': reference.Project, + 'dj_other_table': reference.LabMember, + 'dj_parent_fields': 'project_name', + 'dj_other_field': 'user_name', + 'dj_parent_uuid_name': 'project_uuid', + 'dj_other_uuid_name': 'user_uuid'}, + {'dj_current_table': subject.AlleleSequence, + 'alyx_parent_model': 'subjects.allele', + 'alyx_field': 'sequences', + 'dj_parent_table': subject.Allele, + 'dj_other_table': subject.Sequence, + 'dj_parent_fields': 'allele_name', + 'dj_other_field': 'sequence_name', + 'dj_parent_uuid_name': 'allele_uuid', + 'dj_other_uuid_name': 'sequence_uuid'}, + {'dj_current_table': subject.LineAllele, + 'alyx_parent_model': 'subjects.line', + 'alyx_field': 'alleles', + 'dj_parent_table': subject.Line, + 'dj_other_table': subject.Allele, + 'dj_parent_fields': 'line_name', + 'dj_other_field': 'allele_name', + 'dj_parent_uuid_name': 'line_uuid', + 'dj_other_uuid_name': 'allele_uuid'}, + {'dj_current_table': action.WaterRestrictionUser, + 'alyx_parent_model': 'actions.waterrestriction', + 'alyx_field': 'users', + 'dj_parent_table': action.WaterRestriction, + 'dj_other_table': reference.LabMember, + 'dj_parent_fields': ['subject_uuid', 'restriction_start_time'], + 'dj_other_field': 'user_name', + 'dj_parent_uuid_name': 'restriction_uuid', + 'dj_other_uuid_name': 'user_uuid'}, + {'dj_current_table': action.WaterRestrictionProcedure, + 'alyx_parent_model': 'actions.waterrestriction', + 'alyx_field': 'procedures', + 'dj_parent_table': action.WaterRestriction, + 'dj_other_table': action.ProcedureType, + 'dj_parent_fields': ['subject_uuid', 'restriction_start_time'], + 'dj_other_field': 
'procedure_type_name', + 'dj_parent_uuid_name': 'restriction_uuid', + 'dj_other_uuid_name': 'procedure_type_uuid'}, + {'dj_current_table': action.SurgeryUser, + 'alyx_parent_model': 'actions.surgery', + 'alyx_field': 'users', + 'dj_parent_table': action.Surgery, + 'dj_other_table': reference.LabMember, + 'dj_parent_fields': ['subject_uuid', 'surgery_start_time'], + 'dj_other_field': 'user_name', + 'dj_parent_uuid_name': 'surgery_uuid', + 'dj_other_uuid_name': 'user_uuid'}, + {'dj_current_table': action.SurgeryProcedure, + 'alyx_parent_model': 'actions.surgery', + 'alyx_field': 'procedures', + 'dj_parent_table': action.Surgery, + 'dj_other_table': action.ProcedureType, + 'dj_parent_fields': ['subject_uuid', 'surgery_start_time'], + 'dj_other_field': 'procedure_type_name', + 'dj_parent_uuid_name': 'surgery_uuid', + 'dj_other_uuid_name': 'procedure_type_uuid'}, + {'dj_current_table': action.OtherActionUser, + 'alyx_parent_model': 'actions.otheractions', + 'alyx_field': 'users', + 'dj_parent_table': action.OtherAction, + 'dj_other_table': reference.LabMember, + 'dj_parent_fields': ['subject_uuid', 'other_action_start_time'], + 'dj_other_field': 'user_name', + 'dj_parent_uuid_name': 'other_action_uuid', + 'dj_other_uuid_name': 'user_uuid'}, + {'dj_current_table': action.OtherActionProcedure, + 'alyx_parent_model': 'actions.otheractions', + 'alyx_field': 'procedures', + 'dj_parent_table': action.OtherAction, + 'dj_other_table': action.ProcedureType, + 'dj_parent_fields': ['subject_uuid', 'other_action_start_time'], + 'dj_other_field': 'procedure_type_name', + 'dj_parent_uuid_name': 'other_action_uuid', + 'dj_other_uuid_name': 'procedure_type_uuid'}, + {'dj_current_table': acquisition.ChildSession, + 'alyx_parent_model': 'actions.session', + 'alyx_field': 'parent_session', + 'dj_parent_table': acquisition.Session, + 'dj_other_table': acquisition.Session, + 'dj_parent_fields': ['subject_uuid', 'session_start_time'], + 'dj_other_field': 'session_start_time', + 'dj_parent_uuid_name': 'session_uuid', + 'dj_other_uuid_name': 'session_uuid', + 'renamed_other_field_name': 'parent_session_start_time'}, + {'dj_current_table': acquisition.SessionUser, + 'alyx_parent_model': 'actions.session', + 'alyx_field': 'users', + 'dj_parent_table': acquisition.Session, + 'dj_other_table': reference.LabMember, + 'dj_parent_fields': ['subject_uuid', 'session_start_time'], + 'dj_other_field': 'user_name', + 'dj_parent_uuid_name': 'session_uuid', + 'dj_other_uuid_name': 'user_uuid'}, + {'dj_current_table': acquisition.SessionProcedure, + 'alyx_parent_model': 'actions.session', + 'alyx_field': 'procedures', + 'dj_parent_table': acquisition.Session, + 'dj_other_table': action.ProcedureType, + 'dj_parent_fields': ['subject_uuid', 'session_start_time'], + 'dj_other_field': 'procedure_type_name', + 'dj_parent_uuid_name': 'session_uuid', + 'dj_other_uuid_name': 'procedure_type_uuid'}, + {'dj_current_table': acquisition.SessionProject, + 'alyx_parent_model': 'actions.session', + 'alyx_field': 'project', + 'dj_parent_table': acquisition.Session, + 'dj_other_table': reference.Project, + 'dj_parent_fields': ['subject_uuid', 'session_start_time'], + 'dj_other_field': 'project_name', + 'dj_parent_uuid_name': 'session_uuid', + 'dj_other_uuid_name': 'project_uuid', + 'renamed_other_field_name': 'session_project'}, + {'dj_current_table': acquisition.WaterAdministrationSession, + 'alyx_parent_model': 'actions.wateradministration', + 'alyx_field': 'session', + 'dj_parent_table': action.WaterAdministration, + 'dj_other_table': 
acquisition.Session, + 'dj_parent_fields': ['subject_uuid', 'administration_time'], + 'dj_other_field': 'session_start_time', + 'dj_parent_uuid_name': 'wateradmin_uuid', + 'dj_other_uuid_name': 'session_uuid'}, + {'dj_current_table': data.ProjectRepository, + 'alyx_parent_model': 'subjects.project', + 'alyx_field': 'repositories', + 'dj_parent_table': reference.Project, + 'dj_other_table': data.DataRepository, + 'dj_parent_fields': 'project_name', + 'dj_other_field': 'session_start_time', + 'dj_parent_uuid_name': 'project_uuid', + 'dj_other_uuid_name': 'repo_uuid'}, +] + + +def main(new_pks=None, excluded_tables=[]): + for tab_args in membership_tables: + table_name = tab_args['dj_current_table'].__name__ + if table_name in excluded_tables: + continue + print(f'Ingesting table {table_name}...') + ingest_membership_table(**tab_args, new_pks=new_pks) + + +def ingest_membership_table(dj_current_table, + alyx_parent_model, + alyx_field, + dj_parent_table, dj_other_table, + dj_parent_fields, dj_other_field, + dj_parent_uuid_name, dj_other_uuid_name, + renamed_other_field_name=None, + new_pks=None): + ''' + Ingest shadow membership table. + This function works for the pattern that an alyx parent model contain one or multiple entries of one field + that have the information in the membership table. + + + Arguments: dj_current_table : datajoint table object, current membership table to ingest + alyx_parent_model: string, model name inside alyx that contains information of the current table. + alyx_field : field of alyx that contains information of current table + dj_parent_table : datajoint parent table, corresponding to alyx parent model + dj_other_table : datajoint other table to fetch the field from + dj_parent_fields : string or list of strings, field names to be fetched from the parent table + dj_other_field : string, the field table to be fetched from the other table + dj_parent_uuid_name: string, uuid id name of the parent table + dj_other_uuid_name: string, uuid id name of the other table + renamed_other_field_name: string the other field name sometimes renamed in the real table, + the default is None if the field is not renamed + new_pks : list of strings of valid uuids, this is the new entries to process, the + default is None if all entries are inserted. 
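+
+        Example:
+            A sketch of a direct call, reusing one of the configuration
+            dicts from membership_tables above (main() simply loops over
+            all of them):
+
+                ingest_membership_table(**membership_tables[0], new_pks=None)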
+ ''' + if new_pks: + restr = [{'uuid': pk} for pk in new_pks if is_valid_uuid(pk)] + else: + restr = {} + + alyxraw_to_insert = (alyxraw.AlyxRaw & restr & + {'model': alyx_parent_model}).fetch('KEY') + + if not alyxraw_to_insert: + return + + alyx_field_entries = alyxraw.AlyxRaw.Field & alyxraw_to_insert & \ + {'fname': alyx_field} & 'fvalue!="None"' + + keys = (alyxraw.AlyxRaw & alyx_field_entries).proj(**{dj_parent_uuid_name: 'uuid'}) + + if type(dj_parent_fields) == str: + dj_parent_fields = [dj_parent_fields] + + for key in keys: + + if not dj_parent_table & key: + print(f'The entry {key} is not parent table {dj_parent_table.__name__}') + continue + + entry_base = (dj_parent_table & key).fetch(*dj_parent_fields, as_dict=True)[0] + + key['uuid'] = key[dj_parent_uuid_name] + uuids = grf(key, alyx_field, multiple_entries=True, + model=alyx_parent_model) + if len(uuids): + for uuid in uuids: + if uuid == 'None': + continue + else: + if not dj_other_table & {dj_other_uuid_name: uuid}: + print(f'The uuid {uuid} is not datajoint table {dj_other_table.__name__}') + continue + entry = entry_base.copy() + field_value = (dj_other_table & {dj_other_uuid_name: uuid}).fetch1(dj_other_field) + if renamed_other_field_name: + entry[renamed_other_field_name] = field_value + else: + entry[dj_other_field] = field_value + + dj_current_table.insert1(entry, skip_duplicates=True) + + +if __name__ == '__main__': + + main() diff --git a/ibl_pipeline/process/ingest_real.py b/ibl_pipeline/process/ingest_real.py new file mode 100755 index 00000000..bca79e05 --- /dev/null +++ b/ibl_pipeline/process/ingest_real.py @@ -0,0 +1,175 @@ +''' +This script copies tuples in the shadow tables into the real tables for alyx. +''' + +import datajoint as dj +from ibl_pipeline.ingest.common import * +from ibl_pipeline.common import * +import traceback + +REF_TABLES = ( + 'Lab', + 'LabMember', + 'LabMembership', + 'LabLocation', + 'Project', + 'ProjectLabMember', + 'CoordinateSystem' +) + +SUBJECT_TABLES = ( + 'Species', + 'Strain', + 'Source', + 'Sequence', + 'Allele', + 'AlleleSequence', + 'Line', + 'LineAllele', + 'Subject', + 'SubjectUser', + 'SubjectProject', + 'SubjectLab', + 'BreedingPair', + 'Litter', + 'LitterSubject', + 'Weaning', + 'Death', + 'SubjectCullMethod', + 'Caging', + 'UserHistory', + 'GenotypeTest', + 'Zygosity', + 'Implant', + 'Food', + 'CageType', + 'Enrichment', + 'Housing', + 'SubjectHousing' +) + +ACTION_TABLES = ( + 'ProcedureType', + 'Weighing', + 'WaterType', + 'WaterAdministration', + 'WaterRestriction', + 'WaterRestrictionUser', + 'WaterRestrictionProcedure', + 'Surgery', + 'SurgeryUser', + 'SurgeryProcedure', + 'OtherAction', + 'OtherActionUser', + 'OtherActionProcedure', + 'CullMethod', + 'CullReason', + 'Cull' +) + +ACQUISITION_TABLES = ( + 'Session', + 'ChildSession', + 'SessionUser', + 'SessionProcedure', + 'SessionProject', + 'WaterAdministrationSession' +) + +DATA_TABLES = ( + 'DataFormat', + 'DataRepositoryType', + 'DataRepository', + 'ProjectRepository', + 'DataSetType', + 'DataSet', + 'FileRecord' +) + +EPHYS_TABLES = ( + 'Probe', +) + + +def copy_table(target_schema, src_schema, table_name, + fresh=False, use_uuid=True, **kwargs): + if '.' 
in table_name: + attrs = table_name.split('.') + + target_table = target_schema + src_table = src_schema + for a in attrs: + target_table = getattr(target_table, a) + src_table = getattr(src_table, a) + else: + target_table = getattr(target_schema, table_name) + src_table = getattr(src_schema, table_name) + + if fresh: + target_table.insert(src_table, **kwargs) + else: + if use_uuid: + pk = src_table.heading.primary_key + if len(pk) == 1 and 'uuid' in pk[0]: + q_insert = src_table - (dj.U(pk[0]) & target_table & f'{pk[0]} is not null') + else: + q_insert = src_table - target_table.proj() + else: + q_insert = src_table - target_table.proj() + + try: + target_table.insert(q_insert, skip_duplicates=True, **kwargs) + + except Exception: + for t in (q_insert).fetch(as_dict=True): + try: + if table_name == 'DataSet' and \ + not len(t['dataset_created_by']): + t.pop('dataset_created_by') + target_table.insert1(t, skip_duplicates=True, **kwargs) + except Exception: + print("Error when inserting {}".format(t)) + traceback.print_exc() + + +def main(excluded_tables=[], public=False): + mods = [ + [reference, reference_ingest, REF_TABLES], + [subject, subject_ingest, SUBJECT_TABLES], + [action, action_ingest, ACTION_TABLES], + [acquisition, acquisition_ingest, ACQUISITION_TABLES], + [data, data_ingest, DATA_TABLES] + ] + + for (target, source, table_list) in mods: + for table in table_list: + if table in excluded_tables: + continue + print(table) + copy_table(target, source, table) + + if public: + return + + # ephys tables + table = 'ProbeModel' + print(table) + copy_table(ephys, ephys_ingest, table) + + table = 'ProbeInsertion' + print(table) + copy_table(ephys, ephys_ingest, table, allow_direct_insert=True) + + # histology tables + print('ProbeTrajectory') + histology.ProbeTrajectory.populate(suppress_errors=True, display_progress=True) + + print('ChannelBrainLocation') + copy_table(histology, histology_ingest, 'ChannelBrainLocation', + allow_direct_insert=True) + + +if __name__ == '__main__': + + dj.config['safemode'] = False + main() diff --git a/ibl_pipeline/process/ingest_shadow.py b/ibl_pipeline/process/ingest_shadow.py new file mode 100755 index 00000000..3ad1691a --- /dev/null +++ b/ibl_pipeline/process/ingest_shadow.py @@ -0,0 +1,234 @@ +import datajoint as dj +from ibl_pipeline.ingest import \ + (alyxraw, InsertBuffer, + reference, subject, action, acquisition, data) + +from os import environ + +mode = environ.get('MODE') +if mode != 'public': + from ibl_pipeline.ingest import ephys, histology + +from ibl_pipeline.ingest import get_raw_field as grf +import uuid +from tqdm import tqdm + + +SHADOW_TABLES = [ + reference.Lab, + reference.LabMember, + reference.LabMembership, + reference.LabLocation, + reference.Project, + reference.CoordinateSystem, + subject.Species, + subject.Source, + subject.Strain, + subject.Sequence, + subject.Allele, + subject.Line, + subject.Subject, + subject.BreedingPair, + subject.Litter, + subject.LitterSubject, + subject.SubjectProject, + subject.SubjectUser, + subject.SubjectLab, + subject.Caging, + subject.UserHistory, + subject.Weaning, + subject.Death, + subject.SubjectCullMethod, + subject.GenotypeTest, + subject.Zygosity, + action.ProcedureType, + action.Weighing, + action.WaterType, + action.WaterAdministration, + action.WaterRestriction, + action.Surgery, + action.OtherAction, + action.CullMethod, + action.CullReason, + action.Cull, + acquisition.Session, + data.DataFormat, + data.DataRepositoryType, + data.DataRepository, + data.DataSetType, + # 
data.DataSet, + # data.FileRecord, +] + +if mode != 'public': + SHADOW_TABLES = SHADOW_TABLES + [ + ephys.ProbeModel, + ephys.ProbeInsertion, + histology.ProbeTrajectory, + # histology.ChannelBrainLocation + ] + + +def main(excluded_tables=[], modified_pks=None): + + kwargs = dict( + display_progress=True, + suppress_errors=True) + + for t in SHADOW_TABLES: + if t.__name__ in excluded_tables: + continue + print(f'Ingesting shadow table {t.__name__}...') + + if t.__name__ == 'Session' and modified_pks: + modified_session_keys = [ + {'session_uuid': pk} for pk in modified_pks] + sessions = acquisition.Session & modified_session_keys + if sessions: + modified_session_entries = [] + for key in sessions.fetch('KEY'): + try: + entry = acquisition.Session.create_entry(key) + modified_session_entries.append(entry) + except: + print("Error creating entry for key: {}".format(key)) + if modified_session_entries: + t.insert(modified_session_entries, + allow_direct_insert=True, replace=True) + + t.populate(**kwargs) + + if 'DataSet' not in excluded_tables: + + print('Ingesting dataset entries...') + key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj( + dataset_uuid="uuid") - data.DataSet + + data_set = InsertBuffer(data.DataSet) + + for key in tqdm(key_source.fetch('KEY'), position=0): + key_ds = key.copy() + key['uuid'] = key['dataset_uuid'] + + session = grf(key, 'session') + if not len(acquisition.Session & + dict(session_uuid=uuid.UUID(session))): + print('Session {} is not in the table acquisition.Session'.format( + session)) + print('dataset_uuid: {}'.format(str(key['uuid']))) + continue + + key_ds['subject_uuid'], key_ds['session_start_time'] = \ + (acquisition.Session & + dict(session_uuid=uuid.UUID(session))).fetch1( + 'subject_uuid', 'session_start_time') + + key_ds['dataset_name'] = grf(key, 'name') + + dt = grf(key, 'dataset_type') + key_ds['dataset_type_name'] = \ + (data.DataSetType & dict(dataset_type_uuid=uuid.UUID(dt))).fetch1( + 'dataset_type_name') + + user = grf(key, 'created_by') + + if user != 'None': + try: + key_ds['dataset_created_by'] = \ + (reference.LabMember & dict(user_uuid=uuid.UUID(user))).fetch1( + 'user_name') + except: + print(user) + else: + key_ds['dataset_created_by'] = None + + format = grf(key, 'data_format') + key_ds['format_name'] = \ + (data.DataFormat & dict(format_uuid=uuid.UUID(format))).fetch1( + 'format_name') + + key_ds['created_datetime'] = grf(key, 'created_datetime') + + software = grf(key, 'generating_software') + if software != 'None': + key_ds['generating_software'] = software + else: + key_ds['generating_software'] = None + + directory = grf(key, 'provenance_directory') + if directory != 'None': + key_ds['provenance_directory'] = directory + else: + key_ds['provenance_directory'] = None + + md5 = grf(key, 'md5') + if md5 != 'None': + key_ds['md5'] = md5 + else: + key_ds['md5'] = None + + file_size = grf(key, 'file_size') + if file_size != 'None': + key_ds['file_size'] = file_size + else: + key_ds['file_size'] = None + + data_set.insert1(key_ds) + + if data_set.flush( + skip_duplicates=True, + allow_direct_insert=True, chunksz=100): + print('Inserted 100 dataset tuples') + + if data_set.flush(skip_duplicates=True, allow_direct_insert=True): + print('Inserted all remaining dataset tuples') + + if 'FileRecord' not in excluded_tables: + print('Ingesting file record entries...') + records = alyxraw.AlyxRaw & 'model="data.filerecord"' + repos = (data.DataRepository & 'repo_name LIKE "flatiron%"').fetch( + 'repo_uuid') + records_flatiron = 
alyxraw.AlyxRaw.Field & records & \ + 'fname = "data_repository"' & [{'fvalue': str(repo)} for repo in repos] + record_exists = alyxraw.AlyxRaw.Field & records & \ + 'fname = "exists"' & 'fvalue="True"' + key_source = (alyxraw.AlyxRaw & record_exists & records_flatiron).proj( + record_uuid='uuid') - data.FileRecord + + file_record = InsertBuffer(data.FileRecord) + + for key in tqdm(key_source.fetch('KEY'), position=0): + key_fr = key.copy() + key['uuid'] = key['record_uuid'] + key_fr['exists'] = True + + dataset = grf(key, 'dataset') + if not len(data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))): + print('Dataset {} is not in the table data.DataSet') + print('Record_uuid: {}'.format(str(key['uuid']))) + continue + + key_fr['subject_uuid'], key_fr['session_start_time'], \ + key_fr['dataset_name'] = \ + (data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))).fetch1( + 'subject_uuid', 'session_start_time', 'dataset_name') + + repo = grf(key, 'data_repository') + key_fr['repo_name'] = \ + (data.DataRepository & dict(repo_uuid=uuid.UUID(repo))).fetch1( + 'repo_name') + + key_fr['relative_path'] = grf(key, 'relative_path') + + file_record.insert1(key_fr) + + if file_record.flush( + skip_duplicates=True, allow_direct_insert=True, chunksz=1000): + print('Inserted 1000 raw field tuples') + + if file_record.flush(skip_duplicates=True, allow_direct_insert=True): + print('Inserted all remaining file record tuples') + + +if __name__ == '__main__': + main() diff --git a/ibl_pipeline/process/populate_behavior.py b/ibl_pipeline/process/populate_behavior.py new file mode 100755 index 00000000..f1a08f67 --- /dev/null +++ b/ibl_pipeline/process/populate_behavior.py @@ -0,0 +1,154 @@ +''' +This script ingest behavioral data into tables in the ibl_behavior schema +''' + +from ibl_pipeline import behavior +from ibl_pipeline.analyses import behavior as behavior_analyses +from ibl_pipeline.plotting import behavior as behavior_plotting +import datetime +from ibl_pipeline import subject, reference, action +from tqdm import tqdm +from os import environ + +mode = environ.get('MODE') + +BEHAVIOR_TABLES = [ + behavior.CompleteWheelMoveSession, + behavior.CompleteTrialSession, + behavior.TrialSet, + behavior.AmbientSensorData, + behavior.Settings, + behavior.SessionDelay, + behavior_analyses.PsychResults, + behavior_analyses.PsychResultsBlock, + behavior_analyses.ReactionTime, + behavior_analyses.ReactionTimeContrastBlock, + behavior_analyses.SessionTrainingStatus, + behavior_analyses.BehavioralSummaryByDate, + behavior_plotting.SessionPsychCurve, + behavior_plotting.SessionReactionTimeContrast, + behavior_plotting.SessionReactionTimeTrialNumber, + behavior_plotting.DatePsychCurve, + behavior_plotting.DateReactionTimeContrast, + behavior_plotting.DateReactionTimeTrialNumber, + behavior_plotting.WaterTypeColor +] + + +def compute_latest_date(): + + for key in tqdm(subject.Subject.fetch('KEY'), position=0): + behavior_summary = behavior_analyses.BehavioralSummaryByDate & key + water_weight = action.Weighing * action.WaterAdministration & key + if behavior_summary: + latest_behavior = subject.Subject.aggr( + behavior_summary, + last_behavior_date='MAX(session_date)') + + if water_weight: + latest_weight = subject.Subject.aggr( + action.Weighing & key, + last_weighing_date='DATE(MAX(weighing_time))') + latest_water = subject.Subject.aggr( + action.WaterAdministration & key, + last_water_date='DATE(MAX(administration_time))') + + latest_water_weight = (latest_water * latest_weight).proj( + 
last_water_weight_date='GREATEST(last_water_date, \ + last_weighing_date)' + ) + + if not(behavior_summary or water_weight): + continue + elif behavior_summary and water_weight: + last_behavior_date = latest_behavior.fetch1( + 'last_behavior_date' + ) + last_water_weight_date = latest_water_weight.fetch1( + 'last_water_weight_date' + ) + latest_date = max([last_behavior_date, last_water_weight_date]) + elif behavior_summary: + latest_date = latest_behavior.fetch1( + 'last_behavior_date' + ) + elif water_weight: + latest_date = latest_water_weight.fetch1( + 'last_water_weight_date' + ) + + key['latest_date'] = latest_date + behavior_plotting.LatestDate.insert1(key) + + +def main(backtrack_days=None, excluded_tables=[]): + + kwargs = dict( + suppress_errors=True, display_progress=True) + + if backtrack_days: + date_cutoff = \ + (datetime.datetime.now().date() - + datetime.timedelta(days=backtrack_days)).strftime('%Y-%m-%d') + + for table in BEHAVIOR_TABLES: + + if table.__name__ in excluded_tables: + continue + print(f'Populating {table.__name__}...') + + if backtrack_days and table.__name__ != 'WaterTypeColor': + if 'Date' in table.__name__: + field = 'session_date' + else: + field = 'session_start_time' + restrictor = f'{field} > "{date_cutoff}"' + else: + restrictor = {} + + table.populate(restrictor, **kwargs) + + + print('Populating latest date...') + + compute_latest_date() + + latest = subject.Subject.aggr( + behavior_plotting.LatestDate, + checking_ts='MAX(checking_ts)') * behavior_plotting.LatestDate & \ + ['latest_date between curdate() - interval 30 day and curdate()', + (subject.Subject - subject.Death)] & \ + (subject.Subject & 'subject_nickname not like "%human%"').proj() + + subj_keys = (subject.Subject & behavior_plotting.CumulativeSummary & latest).fetch('KEY') + + # delete and repopulate subject by subject + for subj_key in tqdm(subj_keys, position=0): + (behavior_plotting.CumulativeSummary & subj_key & latest).delete() + behavior_plotting.CumulativeSummary.populate( + latest & subj_key, suppress_errors=True) + # --- update the latest date of the subject ----- + # get the latest date of the CumulativeSummary of the subject + subj_with_latest_date = (subject.Subject & subj_key).aggr( + behavior_plotting.CumulativeSummary, latest_date='max(latest_date)') + new_date = subj_with_latest_date.fetch1('latest_date') + current_subj = behavior_plotting.SubjectLatestDate & subj_key + if len(current_subj): + current_subj._update('latest_date', new_date) + else: + behavior_plotting.SubjectLatestDate.insert1( + subj_with_latest_date.fetch1()) + + behavior_plotting.CumulativeSummary.populate(**kwargs) + + print('Populating plotting.DailyLabSummary...') + last_sessions = (reference.Lab.aggr( + behavior_plotting.DailyLabSummary, + last_session_time='max(last_session_time)')).fetch('KEY') + (behavior_plotting.DailyLabSummary & last_sessions).delete() + behavior_plotting.DailyLabSummary.populate(**kwargs) + + +if __name__ == '__main__': + + main(backtrack_days=30) diff --git a/ibl_pipeline/process/populate_ephys.py b/ibl_pipeline/process/populate_ephys.py new file mode 100644 index 00000000..8bebad3a --- /dev/null +++ b/ibl_pipeline/process/populate_ephys.py @@ -0,0 +1,71 @@ +#!/usr/bin/python3 + +''' +This script tests the ingestion of ephys pipeline. +Shan Shen, 2019-11-20 + +Added a number of plotting tables. 
+Shan Shen, 2020-08-15 +''' + +from ibl_pipeline.common import * +import logging +import time + +import datetime +from uuid import UUID + +EPHYS_TABLES = [ + ephys.CompleteClusterSession, + ephys.DefaultCluster, + ephys.AlignedTrialSpikes, + ephys.GoodCluster, + ephys.ChannelGroup, + ephys_analyses.DepthPeth, + ephys_analyses.NormedDepthPeth, + histology.ClusterBrainRegion, + histology.SessionBrainRegion, + ephys_plotting.DepthRaster, + ephys_plotting.DepthPeth, + ephys_plotting.Raster, + ephys_plotting.Psth, + ephys_plotting.SpikeAmpTime, + ephys_plotting.AutoCorrelogram, + ephys_plotting.Waveform, + ephys_plotting.DepthRasterExampleTrial, +] + + +def main(exclude_plottings=False): + logging.basicConfig( + format='%(asctime)s - %(message)s', + handlers=[ + logging.FileHandler("ephys_ingestion.log"), + logging.StreamHandler()], + level=30) + + logger = logging.getLogger(__name__) + + kwargs = dict(display_progress=True, suppress_errors=True) + + start_time = time.time() + + for table in EPHYS_TABLES: + table_start_time = time.time() + if exclude_plottings and table.__module__ == 'ibl_pipeline.plotting.ephys': + continue + logger.log(30, 'Ingesting {}...'.format(table.__name__)) + table.populate(**kwargs) + logger.log(30, 'Ingestion time of {} is {}'.format( + table.__name__, + time.time()-table_start_time)) + + end_time = time.time() + logger.log(30, 'Total ingestion time {}'.format( + end_time-start_time + )) + + +if __name__ == '__main__': + + main() diff --git a/ibl_pipeline/process/process_histology.py b/ibl_pipeline/process/process_histology.py new file mode 100644 index 00000000..00354503 --- /dev/null +++ b/ibl_pipeline/process/process_histology.py @@ -0,0 +1,189 @@ +from ibl_pipeline.process import ingest_alyx_raw, ingest_real +from ibl_pipeline.ingest.common import * +from ibl_pipeline.ingest import populate_batch, InsertBuffer +from ibl_pipeline.common import * +from ibl_pipeline.process import update_utils +from tqdm import tqdm +import inspect + + +ALYX_HISTOLOGY_MODELS = [ + 'misc.lab', 'misc.labmember', 'misc.labmembership', 'misc.lablocation', + 'subjects.project', 'subjects.species', 'subjects.strain', 'subjects.source', + 'subjects.allele', 'subjects.sequence', 'subjects.subject', + 'actions.proceduretype', 'actions.wateradministration', 'actions.session', + 'experiments.probemodel', + 'experiments.probeinsertion', 'experiments.coordinatesystem', + 'experiments.trajectoryestimate', 'experiments.channel'] + +HISTOLOGY_SHADOW_TABLES = [ + reference_ingest.Lab, + reference_ingest.LabMember, + reference_ingest.LabMembership, + reference_ingest.LabLocation, + reference_ingest.Project, + reference_ingest.CoordinateSystem, + subject_ingest.Species, + subject_ingest.Source, + subject_ingest.Strain, + subject_ingest.Sequence, + subject_ingest.Allele, + subject_ingest.Line, + subject_ingest.Subject, + subject_ingest.SubjectProject, + subject_ingest.SubjectUser, + subject_ingest.SubjectLab, + subject_ingest.UserHistory, + action_ingest.ProcedureType, + action_ingest.WaterAdministration, + acquisition_ingest.Session, + ephys_ingest.ProbeModel, + ephys_ingest.ProbeInsertion, + histology_ingest.ProbeTrajectoryTemp, + histology_ingest.ChannelBrainLocationTemp +] + +HISTOLOGY_TABLES_FOR_DELETE = [ + histology.ProbeBrainRegionTemp, + histology.ClusterBrainRegionTemp, + histology.ChannelBrainLocationTemp, + histology.ProbeTrajectoryTemp, +] + +HISTOLOGY_TABLES_FOR_POPULATE = [ + histology.ClusterBrainRegionTemp, + histology.ProbeBrainRegionTemp, +] + + +def process_alyxraw_histology( + 
filename='/data/alyxfull.json', models=ALYX_HISTOLOGY_MODELS): + + ''' + Ingest all histology entries in a particular alyx dump, regardless of the current status. + ''' + ingest_alyx_raw.insert_to_alyxraw( + ingest_alyx_raw.get_alyx_entries( + filename=filename, + models=models + ) + ) + + +def populate_shadow_tables(): + + kwargs = dict( + display_progress=True, + suppress_errors=True) + + for t in HISTOLOGY_SHADOW_TABLES: + + print(f'Populating {t.__name__}...') + if t.__name__ == 'ChannelBrainLocationTemp': + populate_batch(t) + else: + t.populate(**kwargs) + + +def delete_histology_alyx_shadow(verbose=False): + + CHANNEL_TABLES = [ + histology_ingest.ChannelBrainLocationTemp, + histology_ingest.ChannelBrainLocation, + alyxraw.AlyxRaw.Field, + alyxraw.AlyxRaw + ] + + channel_loc_keys = update_utils.get_deleted_keys('experiments.channel') + for t in CHANNEL_TABLES: + print(f'Deleting from table {t.__name__}') + uuid_name = t.heading.primary_key[0] + keys = [{uuid_name: k['uuid']} for k in tqdm(channel_loc_keys)] + table = InsertBuffer(t) + + for k in tqdm(keys, position=0): + table.delete1(k) + if table.flush_delete(chunksz=1000, quick=True) and verbose: + print(f'Deleted 1000 entries from {t.__name__}') + + table.flush_delete(quick=True) + + traj_keys = update_utils.get_deleted_keys('experiments.trajectoryestimate') + \ + update_utils.get_updated_keys('experiments.trajectoryestimate') + + TRAJ_TABLES = [ + histology_ingest.ProbeTrajectoryTemp, + histology_ingest.ProbeTrajectory, + alyxraw.AlyxRaw.Field, + alyxraw.AlyxRaw + ] + + for t in TRAJ_TABLES: + uuid_name = t.heading.primary_key[0] + keys = [{uuid_name: k['uuid']} for k in traj_keys] + table = InsertBuffer(t) + for k in tqdm(keys, position=0): + table.delete1(k) + if table.flush_delete(chunksz=1000, quick=True) and verbose: + print(f'Deleted 1000 entries from {t.__name__}') + table.flush_delete(quick=True) + + +def delete_histology_real(): + + traj_uuids = update_utils.get_deleted_keys('experiments.trajectoryestimate') + \ + update_utils.get_updated_keys('experiments.trajectoryestimate') + + traj_uuids_real = [ + {'probe_trajectory_uuid': k['uuid']} for k in traj_uuids] + + traj_keys = (histology.ProbeTrajectoryTemp & traj_uuids_real).fetch('KEY') + + for t in HISTOLOGY_TABLES_FOR_DELETE: + print(f'Deleting from table {t.__name__}') + (t & traj_keys).delete_quick() + + +def copy_to_real_tables(): + + for shadow_table in HISTOLOGY_SHADOW_TABLES: + + mod = shadow_table.__module__ + shadow_module = inspect.getmodule(shadow_table) + real_module = eval(mod.replace('ibl_pipeline.ingest.', '')) + table_name = shadow_table.__name__ + print(f'Copying table {table_name}...') + ingest_real.copy_table( + real_module, shadow_module, table_name, allow_direct_insert=True) + + +def populate_real_tables(): + + for t in HISTOLOGY_TABLES_FOR_POPULATE: + print(f'Populating {t.__name__}...') + t.populate(suppress_errors=True, display_progress=True) + + +def main(fpath='/data/alyxfull.json'): + + print('Insert to update alyxraw...') + update_utils.insert_to_update_alyxraw( + filename=fpath, delete_tables=True, + models=['experiments.trajectoryestimate', 'experiments.channel']) + print('Deleting from alyx and shadow...') + delete_histology_alyx_shadow() + print('Ingesting new alyxraw...') + process_alyxraw_histology(filename=fpath) + print('Populating new shadow...') + populate_shadow_tables() + print('Deleting real table entries...') + delete_histology_real() + print('Copying to real tables...') + copy_to_real_tables() + print('Populating real 
tables...') + populate_real_tables() + + +if __name__ == '__main__': + + main() diff --git a/ibl_pipeline/process/process_qc.py b/ibl_pipeline/process/process_qc.py new file mode 100644 index 00000000..f59fb304 --- /dev/null +++ b/ibl_pipeline/process/process_qc.py @@ -0,0 +1,95 @@ +from ibl_pipeline.process import update_utils, ingest_alyx_raw +from ibl_pipeline.ingest import alyxraw +from ibl_pipeline import acquisition, qc +from ibl_pipeline.ingest import qc as qc_ingest +from ibl_pipeline.process.ingest_real import copy_table +import logging + +logging.basicConfig( + format='%(asctime)s - %(message)s', + handlers=[ + logging.FileHandler("/src/IBL-pipeline/ibl_pipeline/process/logs/process_qc.log"), + logging.StreamHandler()], + level=25) + +logger = logging.getLogger(__name__) + +alyx_model = 'actions.session' + + +def delete_qc_entries(): + + qc_keys = update_utils.get_deleted_keys(alyx_model) + \ + update_utils.get_updated_keys(alyx_model, fields=['qc', 'extended_qc']) + + logger.log(25, 'Deleting updated qc and extended_qc from alyxraw...') + (alyxraw.AlyxRaw.Field & + 'fname in ("qc", "extended_qc")' & qc_keys).delete_quick() + + logger.log(25, 'Deleting updated qc and extended_qc from shadow tables') + session_uuids = [{'session_uuid': k['uuid']} for k in qc_keys] + sessions = acquisition.Session & session_uuids + (qc_ingest.SessionQCIngest & session_uuids).delete_quick() + (qc_ingest.SessionQC & sessions).delete_quick() + (qc_ingest.SessionExtendedQC.Field & sessions).delete_quick() + (qc_ingest.SessionExtendedQC & sessions).delete_quick() + + logger.log(25, 'Deleting updated qc and extended_qc from real tables') + (qc.SessionExtendedQC.Field & sessions).delete_quick() + (qc.SessionExtendedQC & sessions).delete_quick() + (qc.SessionQC & sessions).delete_quick() + + +def process_alyxraw_qc( + filename='/data/alyxfull.json', + models=['actions.session']): + ''' + Ingest all qc entries in a particular alyx dump, regardless of the current status. 
+ ''' + + ingest_alyx_raw.insert_to_alyxraw( + ingest_alyx_raw.get_alyx_entries( + filename=filename, + models=models + ), + alyx_type='part' + ) + + +def ingest_shadow_tables(): + + qc_ingest.SessionQCIngest.populate( + display_progress=True, suppress_errors=True) + + +def ingest_real_tables(): + + QC_TABLES = ['SessionQC', 'SessionExtendedQC', 'SessionExtendedQC.Field'] + + for t in QC_TABLES: + copy_table(qc, qc_ingest, t) + + +def main(fpath='/data/alyxfull.json'): + + logger.log(25, 'Insert to update alyxraw...') + update_utils.insert_to_update_alyxraw( + filename=fpath, delete_tables=True, + models=['actions.session']) + + logger.log(25, 'Deleting updated entries...') + delete_qc_entries() + + logger.log(25, 'Ingesting Alyxraw for QC...') + process_alyxraw_qc() + + logger.log(25, 'Ingesting QC shadow tables...') + ingest_shadow_tables() + + logger.log(25, 'Copying real tables...') + ingest_real_tables() + + +if __name__ == '__main__': + + main() diff --git a/ibl_pipeline/process/update_utils.py b/ibl_pipeline/process/update_utils.py new file mode 100644 index 00000000..9f817822 --- /dev/null +++ b/ibl_pipeline/process/update_utils.py @@ -0,0 +1,71 @@ +import datajoint as dj +from tqdm import tqdm +from ibl_pipeline.process import ingest_alyx_raw + +alyxraw = dj.create_virtual_module( + 'alyxraw', dj.config.get('database.prefix', '') + 'ibl_alyxraw') +alyxraw_update = dj.create_virtual_module( + 'alyxraw', 'update_ibl_alyxraw', create_schema=True) + + +def insert_to_update_alyxraw( + filename=None, delete_tables=False, models=None): + + dj.config['safemode'] = False + + if not models: + raise ValueError('Argument models is required, \ + str of an alyx model or a list of alyx models') + + if delete_tables: + + print('Deleting alyxraw update...') + alyxraw_update.AlyxRaw.Field.delete_quick() + alyxraw_update.AlyxRaw.delete_quick() + + ingest_alyx_raw.insert_to_alyxraw( + ingest_alyx_raw.get_alyx_entries( + filename=filename, + models=models), + alyxraw_module=alyxraw_update + ) + + +def get_deleted_keys(model): + return ((alyxraw.AlyxRaw - alyxraw_update.AlyxRaw.proj()) & + f'model="{model}"').fetch('KEY') + + +def get_updated_keys(model, fields=None): + + fields_original = alyxraw.AlyxRaw.Field & (alyxraw.AlyxRaw & f'model="{model}"') + fields_update = alyxraw_update.AlyxRaw.Field & \ + (alyxraw_update.AlyxRaw & f'model="{model}"') + + if not fields: + fields_restr = {} + else: + fields_restr = [{'fname': f} for f in fields] + + return (alyxraw.AlyxRaw & + (fields_update.proj(fvalue_new='fvalue') * fields_original & + 'fvalue_new != fvalue' & 'fname not in ("json")' & fields_restr)).fetch('KEY') + + +def delete_from_alyxraw(keys): + + dj.config['safemode'] = False + + if len(keys) < 50: + (alyxraw.AlyxRaw.Field & keys).delete_quick() + (alyxraw.AlyxRaw & keys).delete() + else: + for key in tqdm(keys, position=0): + (alyxraw.AlyxRaw.Field & key).delete_quick() + (alyxraw.AlyxRaw & key).delete() + + +if __name__ == '__main__': + insert_to_update_alyxraw( + filename='/data/alyxfull_20201013_2222.json', + models=['experiments.trajectoryestimate', 'experiments.channel']) diff --git a/ibl_pipeline/public.py b/ibl_pipeline/public.py old mode 100644 new mode 100755 index bbac0859..c7bb6843 --- a/ibl_pipeline/public.py +++ b/ibl_pipeline/public.py @@ -2,6 +2,7 @@ import pandas as pd import datetime import re +from tqdm import tqdm from uuid import UUID from ibl_pipeline.ingest import alyxraw @@ -48,7 +49,17 @@ def make(self, key): self.insert1(dict(**key, subject_uuid=subject.fetch1('uuid'))) -if __name__ == 
"__main__": +@schema +class PublicSession(dj.Manual): + definition = """ + session_uuid : uuid + --- + subject_uuid=null : uuid + session_start_time=null : datetime + """ + + +def import_public_subjects(): subject_lists = pd.read_csv('/data/list_of_subjects_behavior_paper.csv') @@ -118,3 +129,19 @@ def make(self, key): PublicSubject.insert(subjs, skip_duplicates=True) PublicSubjectUuid.populate(display_progress=True) + + +def import_public_sessions(): + + sessions = pd.read_csv('/data/sessions.csv') + + session_uuids = sessions['0'].values + + PublicSession.insert( + [{'session_uuid': uuid} for uuid in tqdm(session_uuids)] + ) + + +if __name__ == "__main__": + + import_public_sessions() diff --git a/ibl_pipeline/qc.py b/ibl_pipeline/qc.py new file mode 100644 index 00000000..f6bab356 --- /dev/null +++ b/ibl_pipeline/qc.py @@ -0,0 +1,80 @@ +import datajoint as dj +from . import acquisition +import os + + +mode = os.environ.get('MODE') +if mode == 'update': + schema = dj.schema('ibl_qc') +else: + schema = dj.schema(dj.config.get('database.prefix', '') + 'ibl_qc') + + +@schema +class QCChoice(dj.Lookup): + definition = """ + # Available flags to quantify the quality of a session or a specific aspect of a session, lookup table got referred in SessionQC and SessionExtendedQC + qc : tinyint unsigned + --- + qc_label : varchar(32) + """ + + contents = [ + (0, 'NOT_SET'), + (10, 'PASS'), + (30, 'WARNING'), + (40, 'FAIL'), + (50, 'CRITICAL'), + ] + + +@schema +class SessionQC(dj.Manual): + definition = """ + # QCChoice for each session, ingested from alyx field qc in the table actions.session + -> acquisition.Session + --- + -> QCChoice + sessionqc_ts=CURRENT_TIMESTAMP: timestamp + """ + + +@schema +class QCType(dj.Lookup): + definition = """ + # Aspect of a session for quality check. e.g. task, behavior, experimenter… + qc_type : varchar(16) + --- + qc_type_description='' : varchar(1000) + """ + + contents = [ + ['experimenter', 'Manual labeling of a session by user'], + ['task', 'Quality check when running the task'], + ['behavior', 'Behavior criteria'], + ['video', 'Quality check for video recording'], + ['dlc', ''] + ] + + +@schema +class SessionExtendedQC(dj.Manual): + definition = """ + #QCChoice (e.g. FAIL) for a QCType (e.g. task) for each session, structured data about SessionQC + -> acquisition.Session + -> QCType + --- + -> QCChoice.proj(extended_qc='qc') + session_extended_qc_ts=CURRENT_TIMESTAMP: timestamp + """ + + class Field(dj.Part): + definition = """ + # Part table of SessionExtendedQC. For each entry of SessionExtendedQC, there may be multiple fields describing each value (e.g. 0.99) of a qc aspect (e.g. _task_stimOn_delays) that belongs to a QCType (e.g. task). 
+ -> master + qc_fname : varchar(32) + --- + qc_fvalue_float=null : float + qc_fvalue_str=null : varchar(32) + qc_fvalue_blob=null : blob + """ diff --git a/ibl_pipeline/reference.py b/ibl_pipeline/reference.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/subject.py b/ibl_pipeline/subject.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/update.py b/ibl_pipeline/update.py old mode 100644 new mode 100755 index aa337a3a..40a94909 --- a/ibl_pipeline/update.py +++ b/ibl_pipeline/update.py @@ -3,7 +3,7 @@ ''' import datajoint as dj -schema = dj.schema('ibl_update') +schema = dj.schema(dj.config.get('database.prefix', '') + 'ibl_update') @schema @@ -27,6 +27,16 @@ class UpdateRecord(dj.Manual): """ +@schema +class UpdateError(dj.Manual): + definition = """ + -> UpdateRecord + update_action_ts=CURRENT_TIMESTAMP : timestamp # time stamp of the update error + --- + update_error_msg: varchar(255) + """ + + @schema class DeletionRecord(dj.Manual): definition = """ diff --git a/ibl_pipeline/utils/__init__.py b/ibl_pipeline/utils/__init__.py old mode 100644 new mode 100755 index e69de29b..d98bf5c4 --- a/ibl_pipeline/utils/__init__.py +++ b/ibl_pipeline/utils/__init__.py @@ -0,0 +1,9 @@ + +from uuid import UUID + +def is_valid_uuid(uuid): + try: + UUID(uuid) + return True + except ValueError: + return False diff --git a/ibl_pipeline/utils/atlas.py b/ibl_pipeline/utils/atlas.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/utils/delete_unused_external.py b/ibl_pipeline/utils/delete_unused_external.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/utils/dependent_tables.py b/ibl_pipeline/utils/dependent_tables.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/utils/dj_compare_table.py b/ibl_pipeline/utils/dj_compare_table.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/utils/dj_compare_table_archived.py b/ibl_pipeline/utils/dj_compare_table_archived.py old mode 100644 new mode 100755 diff --git a/ibl_pipeline/utils/psychofit.py b/ibl_pipeline/utils/psychofit.py old mode 100644 new mode 100755 diff --git a/images/acquisition.png b/images/acquisition.png new file mode 100644 index 00000000..d2de57a3 Binary files /dev/null and b/images/acquisition.png differ diff --git a/images/acquisition_erd.png b/images/acquisition_erd.png deleted file mode 100644 index fc1f2545..00000000 Binary files a/images/acquisition_erd.png and /dev/null differ diff --git a/images/action.png b/images/action.png new file mode 100644 index 00000000..e1d13f6a Binary files /dev/null and b/images/action.png differ diff --git a/images/all_erd.png b/images/all_erd.png deleted file mode 100644 index e3be2d92..00000000 Binary files a/images/all_erd.png and /dev/null differ diff --git a/images/alyx_erd.png b/images/alyx_erd.png deleted file mode 100644 index 74659a64..00000000 Binary files a/images/alyx_erd.png and /dev/null differ diff --git a/images/behavior.png b/images/behavior.png new file mode 100644 index 00000000..22e5eda2 Binary files /dev/null and b/images/behavior.png differ diff --git a/images/behavior_analyses.png b/images/behavior_analyses.png new file mode 100644 index 00000000..0e2cc27d Binary files /dev/null and b/images/behavior_analyses.png differ diff --git a/images/behavior_erd.png b/images/behavior_erd.png deleted file mode 100644 index da6eb3c3..00000000 Binary files a/images/behavior_erd.png and /dev/null differ diff --git a/images/data.png b/images/data.png new file mode 100644 index 00000000..40e6a7b2 Binary files /dev/null and b/images/data.png 
differ diff --git a/images/ephys.png b/images/ephys.png new file mode 100644 index 00000000..7f19e18b Binary files /dev/null and b/images/ephys.png differ diff --git a/images/ephys_erd.eps b/images/ephys_erd.eps deleted file mode 100644 index 26cefc64..00000000 Binary files a/images/ephys_erd.eps and /dev/null differ diff --git a/images/ephys_erd.png b/images/ephys_erd.png deleted file mode 100644 index 66c0dc7c..00000000 Binary files a/images/ephys_erd.png and /dev/null differ diff --git a/images/histology.png b/images/histology.png new file mode 100644 index 00000000..460e2f1f Binary files /dev/null and b/images/histology.png differ diff --git a/images/qc.png b/images/qc.png new file mode 100644 index 00000000..a2c7942e Binary files /dev/null and b/images/qc.png differ diff --git a/images/reference.png b/images/reference.png new file mode 100644 index 00000000..bf4eb1ab Binary files /dev/null and b/images/reference.png differ diff --git a/images/reference_erd.png b/images/reference_erd.png deleted file mode 100644 index de396b09..00000000 Binary files a/images/reference_erd.png and /dev/null differ diff --git a/images/subject.png b/images/subject.png new file mode 100644 index 00000000..e621adab Binary files /dev/null and b/images/subject.png differ diff --git a/images/subject_acquisition_reference_erd.png b/images/subject_acquisition_reference_erd.png deleted file mode 100644 index 0a166f27..00000000 Binary files a/images/subject_acquisition_reference_erd.png and /dev/null differ diff --git a/images/subject_erd.png b/images/subject_erd.png deleted file mode 100644 index 3107a631..00000000 Binary files a/images/subject_erd.png and /dev/null differ diff --git a/images/subject_reference_erd.png b/images/subject_reference_erd.png deleted file mode 100644 index 11c19dec..00000000 Binary files a/images/subject_reference_erd.png and /dev/null differ diff --git a/notebooks/notebooks_tutorial/202008_ephys_histology/00-Access database, schemas and tables.ipynb b/notebooks/notebooks_tutorial/202008_ephys_histology/00-Access database, schemas and tables.ipynb index b17855b0..5e514d98 100644 --- a/notebooks/notebooks_tutorial/202008_ephys_histology/00-Access database, schemas and tables.ipynb +++ b/notebooks/notebooks_tutorial/202008_ephys_histology/00-Access database, schemas and tables.ipynb @@ -280,7 +280,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/notebooks/notebooks_tutorial/202008_ephys_histology/01-Introduction of ephys and histology tables.ipynb b/notebooks/notebooks_tutorial/202008_ephys_histology/01-Introduction of ephys and histology tables.ipynb index d5394aaa..92261c9b 100644 --- a/notebooks/notebooks_tutorial/202008_ephys_histology/01-Introduction of ephys and histology tables.ipynb +++ b/notebooks/notebooks_tutorial/202008_ephys_histology/01-Introduction of ephys and histology tables.ipynb @@ -140,12 +140,20 @@ "source": [ "Here is a list of important histology tables:\n", "\n", - ">* InsertionDataSource: method to estimate the probe trajectory, including Ephys aligned histology track, Histology track, Micro-manipulator, and Planned \n", - ">* ProbeTrajectory: probe trajectory estimated with each method, ingested from Alyx table experiments.probetrajectory \n", - ">* ChannelBrainLocation: brain coordinates and region assignment of each channel, ingested from Alyx table experiments.channel \n", - ">* ClusterBrainRegion: Brain region assignment to each cluster \n", - ">* 
SessionBrainRegion: Brain regions assignment to each session, including the regions of finest granularity and their upper-level areas. \n", - ">* DepthBrainRegion: For each ProbeTrajectory, assign depth boundaries relative to the probe tip to each brain region covered by the trajectory \n" + ">* Provenance: method to estimate the probe trajectory, including Ephys aligned histology track, Histology track, Micro-manipulator, and Planned \n", + ">* ProbeTrajectoryTemp: probe trajectory estimated with each method, ingested from Alyx table experiments.probetrajectory \n", + ">* ChannelBrainLocationTemp: brain coordinates and region assignment of each channel, ingested from Alyx table experiments.channel \n", + ">* ClusterBrainRegionTemp: Brain region assignment to each cluster \n", + ">* ProbeBrainRegionTemp: Brain regions assignment to each probe, including the regions of finest granularity and their upper-level areas. \n", + ">* DepthBrainRegionTemp: For each ProbeTrajectoryTemp, assign depth boundaries relative to the probe tip to each brain region covered by the trajectory\n", + "\n", + "Tables not in active use and will be redefined:\n", + ">* InputDataSource: will be deleted\n", + ">* ProbeTrajectory: will be redefined to reflect the final probe trajectory with multiple users' approval, ingested from FlatIron data.\n", + ">* ChannelBrainLocation: will be redefined to reflect the final brain location assignment of each channel, ingested from FlatIron data.\n", + ">* ClusterBrainRegion: will be redefined to reflect the final brain region assignment of each cluster, based on ChannelBrainLocation data on Flatiron\n", + ">* ProbeBrainRegion: will be redefined to reflect the final brain regions assignment to each probe, including the regions of finest granularity and their upper-level areas.\n", + ">* DepthBrainRegion: will be redefined to reflect the final depth boundaries relative to the probe tip to each brain region covered by the trajectory" ] }, { @@ -260,6 +268,13 @@ "dj.U('metric_name') & ephys.DefaultCluster.Metric()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check [this document](https://docs.google.com/document/d/1ba_krsfm4epiAd0zbQ8hdvDN908P9VZOpTxkkH3P_ZY/edit) for the meaning of each metric." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -369,7 +384,7 @@ "metadata": {}, "source": [ "If we further need to filter with brain regions, we'll need to bring in several histology related tables.\n", - "The assignment of the brain region of each cluster is in the table `histology.ClusterBrainRegion`" + "The assignment of the brain region of each cluster is in the table `histology.ClusterBrainRegionTemp`" ] }, { @@ -378,14 +393,14 @@ "metadata": {}, "outputs": [], "source": [ - "histology.ClusterBrainRegion.describe();" + "histology.ClusterBrainRegionTemp.describe();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The brain region assignment depends on the insertion data source:" + "The brain region assignment depends on the Provenance:" ] }, { @@ -394,14 +409,14 @@ "metadata": {}, "outputs": [], "source": [ - "histology.InsertionDataSource()" + "histology.Provenance()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The brain region assignment is usually only meaningful with the \"Ephys aligned histology track\"" + "The brain region assignment is usually only meaningful with the \"Ephys aligned histology track\", which has the highest provenance of 70." 
] }, { @@ -410,7 +425,7 @@ "metadata": {}, "outputs": [], "source": [ - "histology.ClusterBrainRegion & 'insertion_data_source = \"Ephys aligned histology track\"'" + "histology.ClusterBrainRegionTemp & 'provenance=70'" ] }, { @@ -443,7 +458,7 @@ "metadata": {}, "outputs": [], "source": [ - "histology.ClusterBrainRegion & 'insertion_data_source = \"Ephys aligned histology track\"' & fronto_pole & good_performance_clusters" + "histology.ClusterBrainRegionTemp & 'provenance = 70' & fronto_pole & good_performance_clusters" ] }, { @@ -589,7 +604,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/notebooks/notebooks_tutorial/202011_qc/00-Access database, schemas and tables.ipynb b/notebooks/notebooks_tutorial/202011_qc/00-Access database, schemas and tables.ipynb new file mode 100644 index 00000000..e4a10864 --- /dev/null +++ b/notebooks/notebooks_tutorial/202011_qc/00-Access database, schemas and tables.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This set of tutorials aim to introduce the important ephys and histology tables that are ready for usage. We will mention some basics of DataJoint but not systematically. For a full-fledged tutorial on the basics, please visit: \n", + "\n", + ">* [Get DataJoint Ready](../201909_code_camp/0-Get%20DataJoint%20Ready.ipynb): connection to database, set up config\n", + ">* [Explore IBL data pipeline with DataJoint](../201909_code_camp/1-Explore%20IBL%20data%20pipeline%20with%20DataJoint.ipynb): plot diagram, query, and fetch\n", + ">* [Analyze data with IBL pipeline and save results](../201909_code_camp/2-Analyze%20data%20with%20IBL%20pipeline%20and%20save%20results.ipynb): use imported and computed table to autopopulate results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Connect to IBL datajoint database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datajoint as dj\n", + "from getpass import getpass\n", + "\n", + "# set up dj.config\n", + "dj.config['database.host'] = 'datajoint.internationalbrainlab.org'\n", + "dj.config['database.user'] = '{YOUR_USER_NAME}'\n", + "dj.config['database.password'] = getpass('Please type in your password: ')\n", + "\n", + "# connect to the database\n", + "dj.conn()\n", + "\n", + "# save the config locally\n", + "dj.config.save_local()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# List all the schemas you have access to, using `dj.list_schemas()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dj.list_schemas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Major schemas: \n", + "Meta data from **Alyx**: `ibl_reference`, `ibl_subject`, `ibl_action`, `ibl_acquisition`, `ibl_data`, and `ibl_qc` \n", + "Imported data from **FlatIron**: `ibl_behavior`, `ibl_ephys`, `ibl_histology` \n", + "Computed analzyed results: `ibl_analyses_behavior`, `ibl_analyses_ephys` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Access the schemas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are two ways of accessing the schemas with DataJoint\n", + "\n", + ">* Create virtual modules\n", + ">* Import modules from ibl-pipeline" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Create virtual modules \n", + "The tables are designed and generated with DataJoint and the codes are in ibl-pipeline, however, if you just want to access the table contents, you don't have to get the code that defines the tables. Instead, DataJoint provides an method called `create_virtual_module`, allowing users to reconstruct the modules and classes based on the **current** structure of the tables in the database. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ephys = dj.create_virtual_module('ephys', 'ibl_ephys')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first argument is the `__name__` of the module you would like to give, usually not very important. The second argument is the schema name." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we get the virtual module `ephys`, that contains all the classes to interact with the tables in the schema. Apart from the populate methods, you could do all other DJ operations on this virtual module, including diagram, queries, fetches, create child tables, delete, and drop. Please be extra coutious in deleting and dropping tables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the relational diagram of the module:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dj.Diagram(ephys)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is a friendly reminder of what these shapes, colors and lines mean:\n", + "\n", + "**Table tiers**: \n", + "Manual table: green box \n", + "Lookup table: gray box \n", + "Imported table: blue oval \n", + "Computed table: red circle \n", + "Part table: plain text\n", + "\n", + "Meaning of table tiers could be found in this [presentation](https://docs.google.com/presentation/d/1mp3Bro1_o_nPScD_g0ygw2z633Rdnd-GGlFEJZmhrBs/edit#slide=id.g7e7b39a7dc_0_5)\n", + "\n", + "**Dependencies**: \n", + "One-to-one primary: thick solid line \n", + "One-to-many primary: thin solid line \n", + "Secondary foreign key reference: dashed line \n", + "Renamed secondary foreign key references: orange dot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We could access tables with the classes of the virtual module. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ephys.DefaultCluster().describe();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create virtual modules are particularly useful in the following scenarios:\n", + "\n", + ">* `group_shared_` schemas: these are the schemas created by the users, and the code to create these modules are not necessarily accessible easily.\n", + ">* `ibl_` schemas: these schemas were created and defined in ibl-pipeline, but as we are in rapid development, the lastest ibl-pipeline package we released may not reflect the current status of the tables. Create virtual modules is a very good way to access the tables with their current definitions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For ephys tables, there are a lot of external fields, such as the `blob@ephys` shown in the above definition. External storage is a feature provided by DataJoint that allows saving bulky data into s3 buckets. 
From the user's point of view, there is no difference from an internal field. However, using external fields requires pre-configuring the storage location. Without the configuration, DataJoint does not know where to fetch the data from." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fetch the first two entries\n", + "ephys.DefaultCluster.fetch('cluster_spikes_times', limit=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To fix the problem, we can `import ibl_pipeline`, where the external storage location is configured. The configuration is stable across different versions of ibl_pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ibl_pipeline\n", + "ephys.DefaultCluster.fetch('cluster_spikes_times', limit=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Directly import from ibl-pipeline\n", + "\n", + "A more standard method is to directly import modules from the package `ibl-pipeline`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ibl_pipeline import ephys, histology" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ephys.DefaultCluster()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "histology.ClusterBrainRegion() & 'insertion_data_source like \"%Ephys%\"'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we introduced how to connect to the database and access schemas and tables. We especially illustrated the usage of `dj.create_virtual_module`, which is quite useful when accessing the rapidly changing schemas and tables.\n", + "\n", + "In the [next notebook](01-Introduction%20of%20ephys%20and%20histology%20tables.ipynb), we will go through the important tables in ephys and histology schemas one-by-one." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/notebooks_tutorial/202011_qc/01-Introduction to QC tables.ipynb b/notebooks/notebooks_tutorial/202011_qc/01-Introduction to QC tables.ipynb new file mode 100644 index 00000000..26ca8bee --- /dev/null +++ b/notebooks/notebooks_tutorial/202011_qc/01-Introduction to QC tables.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook will walk you through the important tables in the qc schema and how to use them to filter the sessions you need" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import datajoint and modules from ibl_pipeline\n", + "import datajoint as dj\n", + "from ibl_pipeline import reference, subject, acquisition, behavior\n", + "from ibl_pipeline.analyses import behavior as behavior_analyses\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "QC tables are still in active development. We therefore recommend accessing them with `dj.create_virtual_module()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qc = dj.create_virtual_module('qc', 'ibl_qc')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# QC tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dj.Diagram(qc) + acquisition.Session" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This diagram shows the QC related tables, and here is the description of the tables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + ">* QCChoice: Available flags to quantify the quality of a session or a specific aspect of a session, lookup table got referred in SessionQC and SessionExtendedQC, 50 for CRITICAL, 40 for FAIL, 30 for WARNING, 10 for PASS, and 0 for NOT SET \n", + ">* SessionQC: QCChoice for each session, ingested from alyx field `qc` in the table `actions.session` \n", + ">* QCType: Aspect of a session for quality check. e.g. task, behavior, experimenter… \n", + ">* SessionExtendedQC: QCChoice (e.g. FAIL) for a QCType (e.g. task) for each session, structured data about SessionQC \n", + ">* SessionExtendedQC.Field: Part table of SessionExtendedQC. For each entry of SessionExtendedQC, there may be multiple fields describing each value (e.g. 0.99) of a qc aspect (e.g. _task_stimOn_delays) that belongs to a QCType (e.g. 
task).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Detailed table definitions could be easily checked with the method `describe()`, for example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qc.QCChoice.describe();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preview the contents of the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qc.QCChoice()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Useful queries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sessions better than CRITICAL?\n", + "acquisition.Session & (qc.SessionQC & 'qc < 50')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sessions better than critical and also good enough for brainwide map?\n", + "\n", + "acquisition.Session & (qc.SessionQC & 'qc < 50') & \\\n", + "(behavior_analyses.SessionTrainingStatus & 'good_enough_for_brainwide_map')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sessions better than critical for task criteria?\n", + "\n", + "acquisition.Session & (qc.SessionExtendedQC & 'qc_type=\"task\"' & 'extended_qc < 40')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we listed a few query examples related to the qc schema that might be helpful for your research. For a full fledged introduction of major types of queries and fetches, please refer to [this notebook](../201909_code_camp/1-Explore%20IBL%20data%20pipeline%20with%20DataJoint.ipynb) during the 2019 IBL Code Camp." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/scripts/delete_shadow_tables_for_updates.py b/scripts/delete_shadow_tables_for_updates.py index 277c40e3..673d0d91 100755 --- a/scripts/delete_shadow_tables_for_updates.py +++ b/scripts/delete_shadow_tables_for_updates.py @@ -1,7 +1,7 @@ import datajoint as dj from ibl_pipeline.ingest import alyxraw, data from ibl_pipeline.ingest import subject, action, acquisition, ephys - +from tqdm import tqdm if __name__ == '__main__': @@ -12,7 +12,7 @@ file_record_fields = alyxraw.AlyxRaw.Field & \ 'fname = "exists"' & 'fvalue = "False"' - for key in file_record_fields: + for key in tqdm(file_record_fields): (alyxraw.AlyxRaw.Field & key).delete_quick() # delete water tables and related alyxraw entries @@ -42,7 +42,7 @@ print('Deleting trajectories estimates time stamp...') (alyxraw.AlyxRaw.Field & (alyxraw.AlyxRaw & 'model="experiments.trajectoryestimate"') & - 'fname="datetime"').delete() + 'fname="datetime"').delete(force=True) # delete some shadow membership tables diff --git a/scripts/ingest_all.sh b/scripts/ingest_all.sh index 7a36d9ba..719f77d5 100755 --- a/scripts/ingest_all.sh +++ b/scripts/ingest_all.sh @@ -1,5 +1,6 @@ - - +date +echo "Deleting alyx shadow tables for updates..." +python delete_shadow_tables_for_updates.py date echo "Ingesting alyx raw..." python ingest_alyx_raw.py "$@" @@ -22,7 +23,9 @@ date echo "Auto updating subject fields..." python auto_update_subject_fields.py date +python auto_update_trajectories.py +date echo "Populating behavior tables..." python ingest_behavior.py date echo "Populating behavior analyses tables..." 
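The QC tutorial above previews each restriction separately; in day-to-day use they are typically combined into a single session selection. The snippet below is a minimal sketch of that pattern, assuming database credentials are already configured as in the tutorials and that the `ibl_qc` schema is reachable; the thresholds (`qc < 50`, task-level `extended_qc < 40`) and the `good_enough_for_brainwide_map` flag are illustrative choices, not mandated cut-offs.

```
import datajoint as dj
from ibl_pipeline import acquisition
from ibl_pipeline.analyses import behavior as behavior_analyses

# the qc schema is still evolving, so access it as a virtual module
qc = dj.create_virtual_module('qc', 'ibl_qc')

# sessions whose overall QC is better than CRITICAL and whose task-level
# extended QC is better than FAIL, restricted to brain-wide-map quality behavior
passing_sessions = (
    acquisition.Session
    & (qc.SessionQC & 'qc < 50')
    & (qc.SessionExtendedQC & 'qc_type="task"' & 'extended_qc < 40')
    & (behavior_analyses.SessionTrainingStatus & 'good_enough_for_brainwide_map')
)

# primary keys for downstream analyses
session_keys = passing_sessions.fetch('KEY')
```

Going through `dj.create_virtual_module` here, as the notebook recommends, keeps the query working even when the released `ibl_pipeline` package lags behind the current table definitions in the database.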
diff --git a/scripts/ingest_increment.py b/scripts/ingest_increment.py new file mode 100644 index 00000000..b6aa5f3d --- /dev/null +++ b/scripts/ingest_increment.py @@ -0,0 +1,5 @@ +from ibl_pipeline.process import autoprocess, get_timezone, process_histology, process_qc + +autoprocess.process_new(timezone=get_timezone()) +process_histology.main() +process_qc.main() diff --git a/scripts/updates/add_empty_entries_to_aligned_times.py b/scripts/updates/add_empty_entries_to_aligned_times.py new file mode 100644 index 00000000..04eeb6ab --- /dev/null +++ b/scripts/updates/add_empty_entries_to_aligned_times.py @@ -0,0 +1,34 @@ + +from ibl_pipeline import behavior, acquisition, ephys +from tqdm import tqdm +import numpy as np +from uuid import UUID +import datetime + +events = ephys.Event & 'event in ("feedback", "movement", "response", "stim on")' + +trial_sets = behavior.TrialSet & ephys.AlignedTrialSpikes + +for trial_set in tqdm(trial_sets): + + clusters = ephys.DefaultCluster & trial_set + + for event in events: + + print('Inserting missing entries for {}...'.format(event['event'])) + + if ephys.AlignedTrialSpikes & event & clusters: + for cluster in tqdm(clusters.aggr( + ephys.AlignedTrialSpikes & event, + n_trials='count(trial_id)') & 'n_trials < {}'.format( + len(behavior.TrialSet.Trial & trial_set))): + trials_missing = (behavior.TrialSet.Trial & cluster) - \ + (ephys.AlignedTrialSpikes & cluster & event) + entries = (ephys.DefaultCluster.proj() * + trials_missing.proj() * + ephys.Event.proj() & event & cluster).fetch(as_dict=True) + + ephys.AlignedTrialSpikes.insert( + [dict(**e, trial_spike_times=np.array([])) for e in entries], + skip_duplicates=True, allow_direct_insert=True + ) diff --git a/setup.py b/setup.py index 039b2c29..fc4dc2f4 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='ibl_pipeline', - version='0.2.2', + version='0.2.3', description='Datajoint schemas for IBL', author='Vathes', author_email='support@vathes.com',
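The new `scripts/ingest_increment.py` above chains the incremental entry points with their defaults. Since both `process_histology.main()` and `process_qc.main()` accept an `fpath` argument (defaulting to `/data/alyxfull.json`), a lightly modified variant can point those two steps at a specific alyx dump; the sketch below assumes the same container layout, and the dated dump filename is a placeholder.

```
from ibl_pipeline.process import autoprocess, get_timezone, process_histology, process_qc

# placeholder dump path -- substitute the file actually mounted under /data
ALYX_DUMP = '/data/alyxfull_20201101.json'

# ingest newly created behavior/ephys entries, respecting the server timezone
autoprocess.process_new(timezone=get_timezone())

# refresh histology (trajectory and channel models) and QC fields from the chosen dump
process_histology.main(fpath=ALYX_DUMP)
process_qc.main(fpath=ALYX_DUMP)
```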