MetaSUB · dcdanko · Dec 28, 2018 · Jan 2, 2019
diff --git a/README.md b/README.md
@@ -1,15 +1,18 @@
-# Overview
+# MetaSUB Reproducible Metadata
 
-This repository is a collection of raw MetaSUB metadata with annotation and scripts to produce cleaned metadata.
+This repository is a collection of raw MetaSUB metadata with annotation and scripts to produce cleaned metadata. It is reproducible in the sense that it generates metadata directly from documentary files without relying on a person to collate files.
 
-To build a clean metadata table run the following commands
+A metadata table can be generated with this command: `metasub-generator best-effort > metadata.csv`
 
-`python setup.py develop` to install
+[This file](https://github.com/MetaSUB/MetaSUB-metadata/blob/ba04acb5bf6df79d2a9cdd30178f80386b6475c0/complete_metadata.csv) is the most recent, complete, and stable version of the metadata (v1.0.1).
 
-`metasub-generator best-effort --sample-names <list of names> > metadata.csv`
-to generate a metadata table.
+## Installation
 
-`--sample-names` may be omitted to use a default list of sample names `spreadsheets/sample_names.txt`
+```
+git clone git@github.com:MetaSUB/MetaSUB-metadata.git
+cd MetaSUB-metadata
+python setup.py develop
+```
 
 ## Spreadsheets
 
@@ -68,36 +71,5 @@ to generate a metadata table.
 
 `map_collate` (`mappers.py`, `sample_wise.py`, `sample.py`) is a miniature package that defines a best-effort approach to matching metadata with various forms of identification. The key insight is that this package is intended to deal with many layers of indirection that may originate in many different places. It relies on a large number of metadata files, many of which were themselves edited and cleaned.
 
-## Known Unknowns
-
- - Metadata for NYC Winter
- - Metadata for PathoMAP
- - Metadata for Pilot
- - Metadata for many gCSD16 samples
- - Metadata for Olympiome
-
-## What are the MetaSUB sample collection projects?
- - CSD-2016
- - CSD-2017
- - Tigress
- - Pilot (Early 2016)
- - NYC Winter
- - Olympiome 2016
-
-## Known Irregularities
-
-London did not use the Kobo Toolbox app in gCSD17
-
-## What are the MetaSUB sample extraction and sequencing batches?
-
-N.B. these were largely informal and are ongoing
-
- - ~5k samples extracted at qiagen, then shipped to hudsonalpha (we have an inventory?)
- - Some samples were sent to Hong Kong for extraction, library prep and sequencing (Hong Kong and Shanghai)
- -  Some samples have been kept in Moscow for extraction, library prep and sequencing
- -  London has shipped barcoded tubes from csd2017 to Shanghai. Then Shanghai has sent WCM the samples
- -  UK sent samples to Zymo. Zymo does not have barcode reader so they're labeling by position (this is resolved)
- -  Stockholm extracted samples from Stockholm and other cities and sent the extracted samples to hudsonalpha
- -  We have <a number> of samples in our freezers
 
 
diff --git a/complete_metadata.csv b/complete_metadata.csv
diff --git a/generators/cli.py b/generators/cli.py
@@ -42,7 +42,6 @@ def best_effort(csv, sample_names):
 
     if csv:
         tbl = pd.DataFrame([sample.to_son() for sample in samples])
-        print(tbl.to_csv())
         tbl = tbl.sort_values(by=['core_project', 'project', 'city', 'metasub_name', 'uuid'])
         tbl = tbl[[
             'uuid',
@@ -89,6 +88,10 @@ def best_effort(csv, sample_names):
             'sample_type',
             'sl_name',
 
+            R1_URI,
+            R2_URI,
+            CONTIG_URI,
+
         ]]
         tbl = tbl.set_index(GENERIC_UID)
         print(tbl.to_csv())

diff --git a/generators/constants.py b/generators/constants.py
@@ -50,13 +50,16 @@
 PROJECT = 'project'
 SAMPLE_TYPE = 'sample_type'
 LOCATION_TYPE = 'location_type'
-IDS = set([HAUID, HA_ID, BC, METASUB_NAME, SL_NAME, OTHER_PROJ_UID])
+IDS = set([GENERIC_UID, HAUID, HA_ID, BC, METASUB_NAME, SL_NAME, OTHER_PROJ_UID])
 CONTROL_STATUS = 'control_type'
 STATION = 'station'
 LINE = 'line'
 INDEX_SEQ = 'index_sequence'
 TEMPERATURE = 'temperature'
 CORE_PROJECT = 'core_project'
+R1_URI = 'read_1_uri'
+R2_URI = 'read_2_uri'
+CONTIG_URI = 'contig_uri'
 
 POSITIVE_CONTROL = 'positive_control'
 NEGATIVE_CONTROL = 'negative_control'

diff --git a/generators/simple_tables.py b/generators/simple_tables.py
@@ -525,6 +525,32 @@ def map_func(sample, sample_id, vec):
 ################################################################################
 
 
+positions = {
+    HAUID: 0,
+    R1_URI: 1,
+    R2_URI: 2,
+}
+wasabi_fastqs = Table(
+    mdata_dir('wasabi_raw_fastq_uris.csv'),
+    positions,
+    token_mapper(*list(positions.keys())),
+)
+
+
+positions = {
+    HAUID: 0,
+    CONTIG_URI: 1,
+}
+wasabi_contigs = Table(
+    mdata_dir('wasabi_contig_uris.csv'),
+    positions,
+    token_mapper(*list(positions.keys())),
+)
+
+
+################################################################################
+
+
 SIMPLE_TABLES = [
     barcelona_csd16,
     haid_to_barcode_4959DB,
@@ -549,4 +575,6 @@ def map_func(sample, sample_id, vec):
     haid_to_csdid_2,
     pathomap_winter,
     read_counts,
+    wasabi_contigs,
+    wasabi_fastqs,
 ]
diff --git a/setup.py b/setup.py
@@ -1,9 +1,9 @@
 import setuptools
 
 setuptools.setup(
-    name="metasub-scripts",
+    name="metasub-metadata",
     version="0.9.0",
-    url="https://github.com/MetaSUB/metasub-scripts",
+    url="https://github.com/MetaSUB/metasub-metadata",
 
     author="David C. Danko",
     author_email="[email protected]",