From fb3f91a9aa8a35822da42eacb781e3165e2000b5 Mon Sep 17 00:00:00 2001 From: Roman Chernyatchik Date: Mon, 11 Dec 2017 13:34:13 +0300 Subject: [PATCH] Annotate Reactome using offline database files #35 --- reports/annotate_reactome.py | 77 ++++++++++++------ test/test_annotate_reactome.py | 90 ++++++++++++++------- test/testdata/reactome/ReactomePathways.txt | 10 +++ 3 files changed, 123 insertions(+), 54 deletions(-) create mode 100644 test/testdata/reactome/ReactomePathways.txt diff --git a/reports/annotate_reactome.py b/reports/annotate_reactome.py index 0f4c9c3..f821094 100644 --- a/reports/annotate_reactome.py +++ b/reports/annotate_reactome.py @@ -35,22 +35,45 @@ def inner(*args, sleep_s=sleep_s, **kw): return deco -@_lxml_try(sleep_s=0.01) -def fetch_title(s): - reactome_id = extract_reactome_id(s) - - if not reactome_id: - return "N/A" +def fetch_title(*strs, pathways_db_path=None): + if pathways_db_path: + df = pd.read_csv(pathways_db_path, sep="\t", header=None, + names=["title", "species"], index_col=0) else: - url = "http://www.reactome.org/content/detail/" + reactome_id - t = lxml.html.parse(url) - return t.find(".//title").text.replace("Reactome | ", "") + df = None + + titles = [] + for s in strs: + reactome_id = extract_reactome_id(s) + + if reactome_id: + if df is not None: + title = _db_fetch_title(reactome_id, df) + else: + title = _web_fetch_title(reactome_id) + else: + title = "N/A" + + titles.append(title) + + return titles + + +def _db_fetch_title(reactome_id, df): + return df.title.loc[reactome_id] + + +@_lxml_try(sleep_s=0.01) +def _web_fetch_title(reactome_id): + url = "http://www.reactome.org/content/detail/" + reactome_id + t = lxml.html.parse(url) + return t.find(".//title").text.replace("Reactome | ", "") def extract_reactome_id(s): match = re.match(".*R-HSA-(\d+).*", str(s), flags=re.IGNORECASE) if match: - reactome_id = match.group(1) + reactome_id = "R-HSA-{}".format(match.group(1)) else: reactome_id = None return reactome_id @@ -64,6 +87,8 @@ def _cli(): ) parser.add_argument('path', metavar="PATH", help="Table path") + parser.add_argument('--db', metavar="PATH", + help="ReactomePathways.txt file path") parser.add_argument('col', metavar="INT_OR_STR", help="Index (>=1) or name of column containing Reactome pathway id. If " @@ -80,6 +105,7 @@ def _cli(): sep = args.sep col = args.col output_path = args.output_path + pathways_db_path = args.db try: col_i = int(col) @@ -108,21 +134,26 @@ def _cli(): if col_idx is None: print("Table header:", ", ".join("'{}'".format(c) for c in df.columns), file=sys.stderr) if index_col is not None: - reactome_ids = df.index + reactome_links = df.index else: - reactome_ids = df.loc[:, col] + reactome_links = df.loc[:, col] else: - reactome_ids = df.iloc[:, col_idx] - - titles = [] - print("Fetch titles for:", len(reactome_ids), "records", file=sys.stderr) - for i, s in enumerate(reactome_ids, 1): - titles.append(fetch_title(s)) - if i % 100 == 0: - print('\n{} of {}'.format(i, len(reactome_ids)), file=sys.stderr) - else: - print('.', end="", file=sys.stderr) - sys.stderr.flush() + reactome_links = df.iloc[:, col_idx] + + print("Fetch titles for:", len(reactome_links), "records", file=sys.stderr) + if not pathways_db_path: + # online mode: + titles = [] + for i, s in enumerate(reactome_links, 1): + titles.append(fetch_title(s)) + if i % 100 == 0: + print('\n{} of {}'.format(i, len(reactome_links)), file=sys.stderr) + else: + print('.', end="", file=sys.stderr) + sys.stderr.flush() + else: + # offline mode + titles = fetch_title(*reactome_links, pathways_db_path=pathways_db_path) print(file=sys.stderr) df['reactome_titles'] = titles diff --git a/test/test_annotate_reactome.py b/test/test_annotate_reactome.py index 743df5c..2cc5be3 100644 --- a/test/test_annotate_reactome.py +++ b/test/test_annotate_reactome.py @@ -9,25 +9,42 @@ @pytest.mark.parametrize("label,rid", [ ("foo", None), ("2644605", None), - ("R-HSA-2644605", "2644605"), - ("r-hsa-2644605", "2644605"), - ("foo_R-HSA-2644605_boo", "2644605"), - ("foo_r-hsa-2644605_boo", "2644605"), - ("foo.R-HSA-2644605-boo", "2644605"), - ("foo-R-HSA-2644605.boo", "2644605"), - ("foo-R-HSA-2644605boo", "2644605"), - ("http://www.reactome.org/content/detail/R-HSA-2644605", "2644605"), + ("R-HSA-2644605", "R-HSA-2644605"), + ("r-hsa-2644605", "R-HSA-2644605"), + ("foo_R-HSA-2644605_boo", "R-HSA-2644605"), + ("foo_r-hsa-2644605_boo", "R-HSA-2644605"), + ("foo.R-HSA-2644605-boo", "R-HSA-2644605"), + ("foo-R-HSA-2644605.boo", "R-HSA-2644605"), + ("foo-R-HSA-2644605boo", "R-HSA-2644605"), + ("http://www.reactome.org/content/detail/R-HSA-2644605", "R-HSA-2644605"), ]) def test_extract_reactome_id(label, rid): assert rid == extract_reactome_id(label) -@pytest.mark.parametrize("label,title", [ - ("foo", "N/A"), - ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer"), +@pytest.mark.parametrize("label,title,offline", [ + ("foo", "N/A", False), + ("foo", "N/A", True), + ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", False), + ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", True), ]) -def test_fetch_title(label, title): - assert title == fetch_title(label) +def test_fetch_title(test_data, label, title, offline): + kw = {} + if offline: + kw['pathways_db_path'] = test_data("reactome/ReactomePathways.txt") + + assert title == fetch_title(label, **kw)[0] + + +@pytest.mark.parametrize("offline", [True, False]) +def test_fetch_titles(test_data, offline): + kw = {} + if offline: + kw['pathways_db_path'] = test_data("reactome/ReactomePathways.txt") + + assert ['FBXW7 Mutants and NOTCH1 in Cancer', '2-LTR circle formation'] == fetch_title( + "R-HSA-2644605", "R-HSA-164843", **kw + ) def test_cli_help(capfd): @@ -35,7 +52,7 @@ def test_cli_help(capfd): "-h") output = capfd.readouterr() assert """python {}/reports/annotate_reactome.py -h -usage: annotate_reactome.py [-h] [-F FS] [-o PATH] PATH INT_OR_STR +usage: annotate_reactome.py [-h] [--db PATH] [-F FS] [-o PATH] PATH INT_OR_STR Annotates table containing http://www.reactome.org pathways ids (R-HSA-nnnnn) with pathways titles @@ -47,37 +64,48 @@ def test_cli_help(capfd): optional arguments: -h, --help show this help message and exit + --db PATH ReactomePathways.txt file path (default: None) -F FS Field separator, auto-detect if not specified (default: None) -o PATH Output path (default: None) """.format(PROJECT_ROOT_PATH) == output[0] assert "" == output[1] -@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult", [ - ("empty.csv", ["1"], "", "File is empty: ", None), - ("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None), - ("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None), - ("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt"), - ("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None), - ("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt"), - ("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt"), - ("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None), - ("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt"), - ("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None), - ("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt"), +@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult,offline", [ + # offline + ("empty.csv", ["1"], "", "File is empty: ", None, True), + ("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True), + ("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None, True), + ("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt", True), + ("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True), + ("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt", True), + ("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt", True), + ("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True), + ("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt", True), + ("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True), + ("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt", True), ("header_colname_comma.csv", ["data", "-F','"], "FBXW7 Mutants and NOTCH1 in Cancer", "", - None), - ("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt"), - ("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt"), + None, True), + ("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt", True), + ("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt", True), ("header_colname_comma.csv", ["data", "-F' '"], "", - "KeyError: 'the label [data] is not in the [columns]'", None), + "KeyError: 'the label [data] is not in the [columns]'", None, True), + + # using web access: + ("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, False), ]) -def test_foo(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult): +def test_cli(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult, + offline): + + pathways_db_path = test_data("reactome/ReactomePathways.txt") if offline else None input_path = str(test_data("reactome/" + file)) if fresult: args.append("-o " + str(tmp_path / "result.txt")) + if pathways_db_path: + args.append("--db " + pathways_db_path) + run( "python", "{}/reports/annotate_reactome.py".format(PROJECT_ROOT_PATH), input_path, diff --git a/test/testdata/reactome/ReactomePathways.txt b/test/testdata/reactome/ReactomePathways.txt new file mode 100644 index 0000000..739db97 --- /dev/null +++ b/test/testdata/reactome/ReactomePathways.txt @@ -0,0 +1,10 @@ +"R-ATH-73843" "5-Phosphoribose 1-diphosphate biosynthesis" "Arabidopsis thaliana" +"R-ATH-1369062" "ABC transporters in lipid homeostasis" "Arabidopsis thaliana" +"R-HSA-168276" " NS1 Mediated Effects on Host Pathways" "Homo sapiens" +"R-HSA-164843" "2-LTR circle formation" "Homo sapiens" +"R-HSA-73843" "5-Phosphoribose 1-diphosphate biosynthesis" "Homo sapiens" +"R-HSA-1971475" "A tetrasaccharide linker sequence is required for GAG synthesis" "Homo sapiens" +"R-HSA-5619084" "ABC transporter disorders" "Homo sapiens" +"R-HSA-2644605" "FBXW7 Mutants and NOTCH1 in Cancer" "Homo sapiens" +"R-XTR-379724" "tRNA Aminoacylation" "Xenopus tropicalis" +"R-XTR-199992" "trans-Golgi Network Vesicle Budding" "Xenopus tropicalis" \ No newline at end of file