Skip to content

Commit

Permalink
Annotate Reactome using offline database files #35
Browse files Browse the repository at this point in the history
  • Loading branch information
iromeo committed Dec 11, 2017
1 parent 6351208 commit fb3f91a
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 54 deletions.
77 changes: 54 additions & 23 deletions reports/annotate_reactome.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,45 @@ def inner(*args, sleep_s=sleep_s, **kw):
return deco


@_lxml_try(sleep_s=0.01)
def fetch_title(s):
reactome_id = extract_reactome_id(s)

if not reactome_id:
return "N/A"
def fetch_title(*strs, pathways_db_path=None):
if pathways_db_path:
df = pd.read_csv(pathways_db_path, sep="\t", header=None,
names=["title", "species"], index_col=0)
else:
url = "http://www.reactome.org/content/detail/" + reactome_id
t = lxml.html.parse(url)
return t.find(".//title").text.replace("Reactome | ", "")
df = None

titles = []
for s in strs:
reactome_id = extract_reactome_id(s)

if reactome_id:
if df is not None:
title = _db_fetch_title(reactome_id, df)
else:
title = _web_fetch_title(reactome_id)
else:
title = "N/A"

titles.append(title)

return titles


def _db_fetch_title(reactome_id, df):
return df.title.loc[reactome_id]


@_lxml_try(sleep_s=0.01)
def _web_fetch_title(reactome_id):
url = "http://www.reactome.org/content/detail/" + reactome_id
t = lxml.html.parse(url)
return t.find(".//title").text.replace("Reactome | ", "")


def extract_reactome_id(s):
match = re.match(".*R-HSA-(\d+).*", str(s), flags=re.IGNORECASE)
if match:
reactome_id = match.group(1)
reactome_id = "R-HSA-{}".format(match.group(1))
else:
reactome_id = None
return reactome_id
Expand All @@ -64,6 +87,8 @@ def _cli():
)
parser.add_argument('path', metavar="PATH",
help="Table path")
parser.add_argument('--db', metavar="PATH",
help="ReactomePathways.txt file path")

parser.add_argument('col', metavar="INT_OR_STR",
help="Index (>=1) or name of column containing Reactome pathway id. If "
Expand All @@ -80,6 +105,7 @@ def _cli():
sep = args.sep
col = args.col
output_path = args.output_path
pathways_db_path = args.db

try:
col_i = int(col)
Expand Down Expand Up @@ -108,21 +134,26 @@ def _cli():
if col_idx is None:
print("Table header:", ", ".join("'{}'".format(c) for c in df.columns), file=sys.stderr)
if index_col is not None:
reactome_ids = df.index
reactome_links = df.index
else:
reactome_ids = df.loc[:, col]
reactome_links = df.loc[:, col]
else:
reactome_ids = df.iloc[:, col_idx]

titles = []
print("Fetch titles for:", len(reactome_ids), "records", file=sys.stderr)
for i, s in enumerate(reactome_ids, 1):
titles.append(fetch_title(s))
if i % 100 == 0:
print('\n{} of {}'.format(i, len(reactome_ids)), file=sys.stderr)
else:
print('.', end="", file=sys.stderr)
sys.stderr.flush()
reactome_links = df.iloc[:, col_idx]

print("Fetch titles for:", len(reactome_links), "records", file=sys.stderr)
if not pathways_db_path:
# online mode:
titles = []
for i, s in enumerate(reactome_links, 1):
titles.append(fetch_title(s))
if i % 100 == 0:
print('\n{} of {}'.format(i, len(reactome_links)), file=sys.stderr)
else:
print('.', end="", file=sys.stderr)
sys.stderr.flush()
else:
# offline mode
titles = fetch_title(*reactome_links, pathways_db_path=pathways_db_path)
print(file=sys.stderr)

df['reactome_titles'] = titles
Expand Down
90 changes: 59 additions & 31 deletions test/test_annotate_reactome.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,33 +9,50 @@
@pytest.mark.parametrize("label,rid", [
("foo", None),
("2644605", None),
("R-HSA-2644605", "2644605"),
("r-hsa-2644605", "2644605"),
("foo_R-HSA-2644605_boo", "2644605"),
("foo_r-hsa-2644605_boo", "2644605"),
("foo.R-HSA-2644605-boo", "2644605"),
("foo-R-HSA-2644605.boo", "2644605"),
("foo-R-HSA-2644605boo", "2644605"),
("http://www.reactome.org/content/detail/R-HSA-2644605", "2644605"),
("R-HSA-2644605", "R-HSA-2644605"),
("r-hsa-2644605", "R-HSA-2644605"),
("foo_R-HSA-2644605_boo", "R-HSA-2644605"),
("foo_r-hsa-2644605_boo", "R-HSA-2644605"),
("foo.R-HSA-2644605-boo", "R-HSA-2644605"),
("foo-R-HSA-2644605.boo", "R-HSA-2644605"),
("foo-R-HSA-2644605boo", "R-HSA-2644605"),
("http://www.reactome.org/content/detail/R-HSA-2644605", "R-HSA-2644605"),
])
def test_extract_reactome_id(label, rid):
assert rid == extract_reactome_id(label)


@pytest.mark.parametrize("label,title", [
("foo", "N/A"),
("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer"),
@pytest.mark.parametrize("label,title,offline", [
("foo", "N/A", False),
("foo", "N/A", True),
("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", False),
("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", True),
])
def test_fetch_title(label, title):
assert title == fetch_title(label)
def test_fetch_title(test_data, label, title, offline):
kw = {}
if offline:
kw['pathways_db_path'] = test_data("reactome/ReactomePathways.txt")

assert title == fetch_title(label, **kw)[0]


@pytest.mark.parametrize("offline", [True, False])
def test_fetch_titles(test_data, offline):
kw = {}
if offline:
kw['pathways_db_path'] = test_data("reactome/ReactomePathways.txt")

assert ['FBXW7 Mutants and NOTCH1 in Cancer', '2-LTR circle formation'] == fetch_title(
"R-HSA-2644605", "R-HSA-164843", **kw
)


def test_cli_help(capfd):
run("python", "{}/reports/annotate_reactome.py".format(PROJECT_ROOT_PATH),
"-h")
output = capfd.readouterr()
assert """python {}/reports/annotate_reactome.py -h
usage: annotate_reactome.py [-h] [-F FS] [-o PATH] PATH INT_OR_STR
usage: annotate_reactome.py [-h] [--db PATH] [-F FS] [-o PATH] PATH INT_OR_STR
Annotates table containing http://www.reactome.org pathways ids (R-HSA-nnnnn)
with pathways titles
Expand All @@ -47,37 +64,48 @@ def test_cli_help(capfd):
optional arguments:
-h, --help show this help message and exit
--db PATH ReactomePathways.txt file path (default: None)
-F FS Field separator, auto-detect if not specified (default: None)
-o PATH Output path (default: None)
""".format(PROJECT_ROOT_PATH) == output[0]
assert "" == output[1]


@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult", [
("empty.csv", ["1"], "", "File is empty: ", None),
("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None),
("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt"),
("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt"),
("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt"),
("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt"),
("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt"),
@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult,offline", [
# offline
("empty.csv", ["1"], "", "File is empty: ", None, True),
("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None, True),
("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt", True),
("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt", True),
("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt", True),
("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt", True),
("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt", True),
("header_colname_comma.csv", ["data", "-F','"], "FBXW7 Mutants and NOTCH1 in Cancer", "",
None),
("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt"),
("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt"),
None, True),
("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt", True),
("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt", True),
("header_colname_comma.csv", ["data", "-F' '"], "",
"KeyError: 'the label [data] is not in the [columns]'", None),
"KeyError: 'the label [data] is not in the [columns]'", None, True),
# using web access:
("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, False),
])
def test_foo(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult):
def test_cli(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult,
offline):

pathways_db_path = test_data("reactome/ReactomePathways.txt") if offline else None
input_path = str(test_data("reactome/" + file))

if fresult:
args.append("-o " + str(tmp_path / "result.txt"))

if pathways_db_path:
args.append("--db " + pathways_db_path)

run(
"python", "{}/reports/annotate_reactome.py".format(PROJECT_ROOT_PATH),
input_path,
Expand Down
10 changes: 10 additions & 0 deletions test/testdata/reactome/ReactomePathways.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"R-ATH-73843" "5-Phosphoribose 1-diphosphate biosynthesis" "Arabidopsis thaliana"
"R-ATH-1369062" "ABC transporters in lipid homeostasis" "Arabidopsis thaliana"
"R-HSA-168276" " NS1 Mediated Effects on Host Pathways" "Homo sapiens"
"R-HSA-164843" "2-LTR circle formation" "Homo sapiens"
"R-HSA-73843" "5-Phosphoribose 1-diphosphate biosynthesis" "Homo sapiens"
"R-HSA-1971475" "A tetrasaccharide linker sequence is required for GAG synthesis" "Homo sapiens"
"R-HSA-5619084" "ABC transporter disorders" "Homo sapiens"
"R-HSA-2644605" "FBXW7 Mutants and NOTCH1 in Cancer" "Homo sapiens"
"R-XTR-379724" "tRNA Aminoacylation" "Xenopus tropicalis"
"R-XTR-199992" "trans-Golgi Network Vesicle Budding" "Xenopus tropicalis"

0 comments on commit fb3f91a

Please sign in to comment.