From fb3f91a9aa8a35822da42eacb781e3165e2000b5 Mon Sep 17 00:00:00 2001
From: Roman Chernyatchik <roman.chernyatchik@jetbrains.com>
Date: Mon, 11 Dec 2017 13:34:13 +0300
Subject: [PATCH] Annotate Reactome using offline database files #35

---
 reports/annotate_reactome.py                | 77 ++++++++++++------
 test/test_annotate_reactome.py              | 90 ++++++++++++++-------
 test/testdata/reactome/ReactomePathways.txt | 10 +++
 3 files changed, 123 insertions(+), 54 deletions(-)
 create mode 100644 test/testdata/reactome/ReactomePathways.txt

diff --git a/reports/annotate_reactome.py b/reports/annotate_reactome.py
index 0f4c9c3..f821094 100644
--- a/reports/annotate_reactome.py
+++ b/reports/annotate_reactome.py
@@ -35,22 +35,45 @@ def inner(*args, sleep_s=sleep_s, **kw):
     return deco
 
 
-@_lxml_try(sleep_s=0.01)
-def fetch_title(s):
-    reactome_id = extract_reactome_id(s)
-
-    if not reactome_id:
-        return "N/A"
+def fetch_title(*strs, pathways_db_path=None):
+    if pathways_db_path:
+        df = pd.read_csv(pathways_db_path, sep="\t", header=None,
+                         names=["title", "species"], index_col=0)
     else:
-        url = "http://www.reactome.org/content/detail/" + reactome_id
-        t = lxml.html.parse(url)
-        return t.find(".//title").text.replace("Reactome | ", "")
+        df = None
+
+    titles = []
+    for s in strs:
+        reactome_id = extract_reactome_id(s)
+
+        if reactome_id:
+            if df is not None:
+                title = _db_fetch_title(reactome_id, df)
+            else:
+                title = _web_fetch_title(reactome_id)
+        else:
+            title = "N/A"
+
+        titles.append(title)
+
+    return titles
+
+
+def _db_fetch_title(reactome_id, df):
+    return df.title.loc[reactome_id]
+
+
+@_lxml_try(sleep_s=0.01)
+def _web_fetch_title(reactome_id):
+    url = "http://www.reactome.org/content/detail/" + reactome_id
+    t = lxml.html.parse(url)
+    return t.find(".//title").text.replace("Reactome | ", "")
 
 
 def extract_reactome_id(s):
     match = re.match(".*R-HSA-(\d+).*", str(s), flags=re.IGNORECASE)
     if match:
-        reactome_id = match.group(1)
+        reactome_id = "R-HSA-{}".format(match.group(1))
     else:
         reactome_id = None
     return reactome_id
@@ -64,6 +87,8 @@ def _cli():
     )
     parser.add_argument('path', metavar="PATH",
                         help="Table path")
+    parser.add_argument('--db', metavar="PATH",
+                        help="ReactomePathways.txt file path")
 
     parser.add_argument('col', metavar="INT_OR_STR",
                         help="Index (>=1) or name of column containing Reactome pathway id. If "
@@ -80,6 +105,7 @@ def _cli():
     sep = args.sep
     col = args.col
     output_path = args.output_path
+    pathways_db_path = args.db
 
     try:
         col_i = int(col)
@@ -108,21 +134,26 @@ def _cli():
     if col_idx is None:
         print("Table header:", ", ".join("'{}'".format(c) for c in df.columns), file=sys.stderr)
         if index_col is not None:
-            reactome_ids = df.index
+            reactome_links = df.index
         else:
-            reactome_ids = df.loc[:, col]
+            reactome_links = df.loc[:, col]
     else:
-        reactome_ids = df.iloc[:, col_idx]
-
-    titles = []
-    print("Fetch titles for:", len(reactome_ids), "records", file=sys.stderr)
-    for i, s in enumerate(reactome_ids, 1):
-        titles.append(fetch_title(s))
-        if i % 100 == 0:
-            print('\n{} of {}'.format(i, len(reactome_ids)), file=sys.stderr)
-        else:
-            print('.', end="", file=sys.stderr)
-            sys.stderr.flush()
+        reactome_links = df.iloc[:, col_idx]
+
+    print("Fetch titles for:", len(reactome_links), "records", file=sys.stderr)
+    if not pathways_db_path:
+        # online mode:
+        titles = []
+        for i, s in enumerate(reactome_links, 1):
+            titles.append(fetch_title(s))
+            if i % 100 == 0:
+                print('\n{} of {}'.format(i, len(reactome_links)), file=sys.stderr)
+            else:
+                print('.', end="", file=sys.stderr)
+                sys.stderr.flush()
+    else:
+        # offline mode
+        titles = fetch_title(*reactome_links, pathways_db_path=pathways_db_path)
     print(file=sys.stderr)
 
     df['reactome_titles'] = titles
diff --git a/test/test_annotate_reactome.py b/test/test_annotate_reactome.py
index 743df5c..2cc5be3 100644
--- a/test/test_annotate_reactome.py
+++ b/test/test_annotate_reactome.py
@@ -9,25 +9,42 @@
 @pytest.mark.parametrize("label,rid", [
     ("foo", None),
     ("2644605", None),
-    ("R-HSA-2644605", "2644605"),
-    ("r-hsa-2644605", "2644605"),
-    ("foo_R-HSA-2644605_boo", "2644605"),
-    ("foo_r-hsa-2644605_boo", "2644605"),
-    ("foo.R-HSA-2644605-boo", "2644605"),
-    ("foo-R-HSA-2644605.boo", "2644605"),
-    ("foo-R-HSA-2644605boo", "2644605"),
-    ("http://www.reactome.org/content/detail/R-HSA-2644605", "2644605"),
+    ("R-HSA-2644605", "R-HSA-2644605"),
+    ("r-hsa-2644605", "R-HSA-2644605"),
+    ("foo_R-HSA-2644605_boo", "R-HSA-2644605"),
+    ("foo_r-hsa-2644605_boo", "R-HSA-2644605"),
+    ("foo.R-HSA-2644605-boo", "R-HSA-2644605"),
+    ("foo-R-HSA-2644605.boo", "R-HSA-2644605"),
+    ("foo-R-HSA-2644605boo", "R-HSA-2644605"),
+    ("http://www.reactome.org/content/detail/R-HSA-2644605", "R-HSA-2644605"),
 ])
 def test_extract_reactome_id(label, rid):
     assert rid == extract_reactome_id(label)
 
 
-@pytest.mark.parametrize("label,title", [
-    ("foo", "N/A"),
-    ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer"),
+@pytest.mark.parametrize("label,title,offline", [
+    ("foo", "N/A", False),
+    ("foo", "N/A", True),
+    ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", False),
+    ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", True),
 ])
-def test_fetch_title(label, title):
-    assert title == fetch_title(label)
+def test_fetch_title(test_data, label, title, offline):
+    kw = {}
+    if offline:
+        kw['pathways_db_path'] = test_data("reactome/ReactomePathways.txt")
+
+    assert title == fetch_title(label, **kw)[0]
+
+
+@pytest.mark.parametrize("offline", [True, False])
+def test_fetch_titles(test_data, offline):
+    kw = {}
+    if offline:
+        kw['pathways_db_path'] = test_data("reactome/ReactomePathways.txt")
+
+    assert ['FBXW7 Mutants and NOTCH1 in Cancer', '2-LTR circle formation'] == fetch_title(
+        "R-HSA-2644605", "R-HSA-164843", **kw
+    )
 
 
 def test_cli_help(capfd):
@@ -35,7 +52,7 @@ def test_cli_help(capfd):
         "-h")
     output = capfd.readouterr()
     assert """python {}/reports/annotate_reactome.py -h
-usage: annotate_reactome.py [-h] [-F FS] [-o PATH] PATH INT_OR_STR
+usage: annotate_reactome.py [-h] [--db PATH] [-F FS] [-o PATH] PATH INT_OR_STR
 
 Annotates table containing http://www.reactome.org pathways ids (R-HSA-nnnnn)
 with pathways titles
@@ -47,37 +64,48 @@ def test_cli_help(capfd):
 
 optional arguments:
   -h, --help  show this help message and exit
+  --db PATH   ReactomePathways.txt file path (default: None)
   -F FS       Field separator, auto-detect if not specified (default: None)
   -o PATH     Output path (default: None)
 """.format(PROJECT_ROOT_PATH) == output[0]
     assert "" == output[1]
 
 
-@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult", [
-    ("empty.csv", ["1"], "", "File is empty: ", None),
-    ("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
-    ("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None),
-    ("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt"),
-    ("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
-    ("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt"),
-    ("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt"),
-    ("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
-    ("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt"),
-    ("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
-    ("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt"),
+@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult,offline", [
+    # offline
+    ("empty.csv", ["1"], "", "File is empty: ", None, True),
+    ("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
+    ("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None, True),
+    ("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt", True),
+    ("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
+    ("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt", True),
+    ("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt", True),
+    ("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
+    ("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt", True),
+    ("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
+    ("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt", True),
     ("header_colname_comma.csv", ["data", "-F','"], "FBXW7 Mutants and NOTCH1 in Cancer", "",
-     None),
-    ("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt"),
-    ("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt"),
+     None, True),
+    ("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt", True),
+    ("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt", True),
     ("header_colname_comma.csv", ["data", "-F' '"], "",
-     "KeyError: 'the label [data] is not in the [columns]'", None),
+     "KeyError: 'the label [data] is not in the [columns]'", None, True),
+
+    # using web access:
+    ("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, False),
 ])
-def test_foo(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult):
+def test_cli(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult,
+             offline):
+
+    pathways_db_path = test_data("reactome/ReactomePathways.txt") if offline else None
     input_path = str(test_data("reactome/" + file))
 
     if fresult:
         args.append("-o " + str(tmp_path / "result.txt"))
 
+    if pathways_db_path:
+        args.append("--db " + pathways_db_path)
+
     run(
         "python", "{}/reports/annotate_reactome.py".format(PROJECT_ROOT_PATH),
         input_path,
diff --git a/test/testdata/reactome/ReactomePathways.txt b/test/testdata/reactome/ReactomePathways.txt
new file mode 100644
index 0000000..739db97
--- /dev/null
+++ b/test/testdata/reactome/ReactomePathways.txt
@@ -0,0 +1,10 @@
+"R-ATH-73843"	"5-Phosphoribose 1-diphosphate biosynthesis"	"Arabidopsis thaliana"
+"R-ATH-1369062"	"ABC transporters in lipid homeostasis"	"Arabidopsis thaliana"
+"R-HSA-168276"	" NS1 Mediated Effects on Host Pathways"	"Homo sapiens"
+"R-HSA-164843"	"2-LTR circle formation"	"Homo sapiens"
+"R-HSA-73843"	"5-Phosphoribose 1-diphosphate biosynthesis"	"Homo sapiens"
+"R-HSA-1971475"	"A tetrasaccharide linker sequence is required for GAG synthesis"	"Homo sapiens"
+"R-HSA-5619084"	"ABC transporter disorders"	"Homo sapiens"
+"R-HSA-2644605"	"FBXW7 Mutants and NOTCH1 in Cancer"	"Homo sapiens"
+"R-XTR-379724"	"tRNA Aminoacylation"	"Xenopus tropicalis"
+"R-XTR-199992"	"trans-Golgi Network Vesicle Budding"	"Xenopus tropicalis"
\ No newline at end of file