-
Notifications
You must be signed in to change notification settings - Fork 0
/
module_3.py
51 lines (39 loc) · 1.58 KB
/
module_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 29 16:04:46 2020
@author: Yihang Zhou
Contact: [email protected]
https://github.com/Y-H-Joe/
####============================ description ==============================####
## module 3
Given the ASSEMBLY, web-scrape the "Total ungapped length" from the NCBI website.
Write the genome size to the genome_size file in the name-folder.
#================================== input =====================================
#================================== output ====================================
#================================ parameters ==================================
#================================== example ===================================
#================================== warning ===================================
####=======================================================================####
"""
import requests
import random
import time
from bs4 import BeautifulSoup
import sys
import os
#os.system('taskset -p %s' %os.getpid())
## ---- command-line arguments ----
## argv[1]: assembly identifier, inserted into the NCBI assembly URL
## argv[2]: directory in which the genome_size file is written
if len(sys.argv) < 3:
    # Fail with a usage hint instead of a bare IndexError.
    sys.exit("usage: module_3.py ASSEMBLY NAME_FOLDER_DIR")
assembly = sys.argv[1]
name_folder_dir = sys.argv[2]

## random pause of 1-98 seconds before the request, in case of IP restriction
time.sleep(random.randint(1, 98))

## scrape the genome_size = "Total ungapped length"
url = "https://www.ncbi.nlm.nih.gov/assembly/" + assembly + "/"
print("module 3: requesting ", url)
# Without a timeout a stalled connection would block the script forever.
html = requests.get(url, timeout=60)
# Stop on HTTP errors (4xx/5xx) rather than parsing an error page.
html.raise_for_status()
bf = BeautifulSoup(html.text, features="lxml")

# Locate the label cell, then the next <td> sibling holding the value.
# find_next_sibling("td") skips any whitespace text node between the cells,
# which a plain .next_sibling would return instead of the value cell.
label_cell = bf.find("td", string="Total ungapped length")
value_cell = None
if label_cell is not None:
    value_cell = label_cell.find_next_sibling("td")
if value_cell is None:
    raise ValueError(
        "module 3: 'Total ungapped length' not found at " + url
    )

# The value is expected with thousands separators (commas); drop them
# before converting to int.
len_str = value_cell.text
len_int = int(len_str.replace(",", ""))

## write the genome size
print("module 3: writing genome_size.")
with open(os.path.join(name_folder_dir, "genome_size"), 'w') as f:
    f.write(str(len_int))