Commit 49670e0: 16 changed files with 1,417 additions and 0 deletions.
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
@@ -0,0 +1,15 @@
"""Config values."""
from os import environ


class Config:

    # Database config
    db_user = environ.get('DATABASE_USERNAME')
    db_password = environ.get('DATABASE_PASSWORD')
    db_host = environ.get('DATABASE_HOST')
    db_port = environ.get('DATABASE_PORT')
    db_name = environ.get('DATABASE_NAME')

    # We're pulling the values for each of these from a .env file:
    # a practice I highly recommend for security purposes.
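Since these values come from a .env file, they have to land in the process environment before Config is imported. A minimal sketch, assuming the python-dotenv package; the config module name and the .env key names are only inferred from the class above, not shown in the diff:

# Sketch only: load a .env file into os.environ before importing Config (assumes python-dotenv)
from dotenv import load_dotenv

load_dotenv()  # expects a .env file with DATABASE_USERNAME, DATABASE_PASSWORD,
               # DATABASE_HOST, DATABASE_PORT and DATABASE_NAME entries

from config import Config  # "config" module name is assumed; the diff view hides file names
print(Config.db_host)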
@@ -0,0 +1,14 @@
class Database:
    """Database connection class."""

    def __init__(self, config):
        self.host = config.db_host
        self.username = config.db_user
        self.password = config.db_password
        self.port = config.db_port
        self.dbname = config.db_name
        self.conn = None

# Initializing this class saves our database connection variables to the instance of the class,
# and creates a self.conn variable for managing connections.
# We create an instance of this class by passing our config object to Database:
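The comment above points at that instantiation; a minimal sketch of what it looks like, assuming the config and database module names (the diff view does not show file names):

# Sketch only: wire the Config class into Database
from config import Config
from database import Database

db = Database(Config)      # Config's class attributes supply host, user, password, port and name
print(db.host, db.port)    # db.conn stays None until open_connection() is called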
@@ -0,0 +1,26 @@
First things first, we need to create a config file. Unlike SQLAlchemy, PyMySQL doesn't support database URI strings out of the box. Because of this, we need to set a variable for each part of the database connection: username, password, host, and so on.


"""
try:
    db = Database(config)
except:
    print("Database could not be initialized")


class Database:
    """Database connection class."""

    def open_connection(self):
        """Connect to MySQL Database."""
        try:
            if self.conn is None:
                self.conn = pymysql.connect(host=self.host,
                                            user=self.username,
                                            passwd=self.password,
                                            db=self.dbname,
                                            connect_timeout=5)
        except pymysql.MySQLError as e:
            logging.error(e)
            sys.exit()
        finally:
            logging.info('Connection opened successfully.')
"""
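For context, a minimal sketch of how the class might be exercised once open_connection has populated self.conn; the Config import and the pages table are assumed from the rest of this commit:

# Sketch only: open the connection once, then reuse it through a cursor
db = Database(Config)
db.open_connection()
with db.conn.cursor() as cur:
    cur.execute("SELECT title, url FROM pages LIMIT 5;")
    for row in cur.fetchall():
        print(row)
db.conn.close()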
@@ -0,0 +1,253 @@
import urllib
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
import urllib.parse

import datetime
import itertools  # used by xpath_soup below
import random
import re
import sys
import pymysql
import logging
import pymysql.cursors
import requests as rq
from SQLConnection import DBHelper
import linecache
import webbrowser
import numpy as np
# Now, let's use requests_html to run the JavaScript code in order to render the HTML we're looking for.
# import HTMLSession from requests_html
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import chromedriver_binary  # Adds chromedriver binary to path
from webdriver_manager.chrome import ChromeDriverManager

# from autotest_lib.client.common_lib.cros import chromedriver

try:
    from requests_html import HTMLSession
except ImportError:
    try:
        import requests_html
    except ImportError:
        print("The system can't find requests_html. BLAST search results will not"
              + " be parsed without this module.")
try:
    from bs4 import BeautifulSoup
except ImportError:
    try:
        import BeautifulSoup
    except ImportError:
        print("The system can't find BeautifulSoup. BLAST search results will not"
              + " be parsed without this module.")


def PrintException():
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, f.f_globals)
    print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))


def store(title, content, url):
    print(title, content, url)
    # Let PyMySQL quote the values: the placeholders must not be wrapped in quotes themselves.
    cur.execute("INSERT INTO pdb.pages (title, content, url) VALUES (%s, %s, %s)", (title, content, url))
    cur.connection.commit()


def getLinks(articleUrl):
    html = urlopen(articleUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    try:
        title = bsObj.find("h1").get_text()
        print(title)
    except:
        PrintException()
    try:
        content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
        print(content)
    except:
        PrintException()

# Open database connection
connection = pymysql.connect(host='localhost', port=int(5000), user='root', password='Br_4912#862', db='mysql', charset='utf8')

# prepare a cursor object using cursor() method
cur = connection.cursor()
# Drop table if it already exists using execute() method.
cur.execute("DROP TABLE IF EXISTS pdb.pages;")
print(cur.fetchone())
from string import ascii_lowercase

# CREATE TABLE as per requirement
# (the table lives in the pdb schema and includes a url column, so that store() above has somewhere to write the page URL)
sql = """CREATE TABLE pdb.pages (
    id BIGINT(7) NOT NULL AUTO_INCREMENT,
    title VARCHAR(200),
    content VARCHAR(10000),
    url VARCHAR(255),
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
    );"""

try:
    cur.execute(sql)
    connection.commit()
    cur.execute("SELECT * FROM pdb.pages")
    print(cur.fetchone())
except:
    # Rollback in case there is any error
    PrintException()
    connection.rollback()


def xpath_soup(element):
    """
    Generate xpath of soup element
    :param element: bs4 text or node
    :return: xpath as string
    """
    components = []
    child = element if element.name else element.parent
    for parent in child.parents:
        """
        @type parent: bs4.element.Tag
        """
        previous = itertools.islice(parent.children, 0, parent.contents.index(child))
        xpath_tag = child.name
        xpath_index = sum(1 for i in previous if i.name == xpath_tag) + 1
        components.append(xpath_tag if xpath_index == 1 else '%s[%d]' % (xpath_tag, xpath_index))
        child = parent
    components.reverse()
    return '/%s' % '/'.join(components)


def get_pdb_page(link):
    """
    Fetch a single structure page on rcsb.org and store its title,
    primary citation and DOI link in the database.
    :param link: relative /structure/ path scraped from the search results
    """
    # browser = webdriver.Chrome(ChromeDriverManager().install())  # not needed here; Selenium is only used in get_Search

    try:
        test = urllib.parse.urljoin("http://www.rcsb.org/", link)

        link = urlopen(test)
        bsObj = BeautifulSoup(link, 'lxml')
        title = bsObj.find("span", {"id": "structureID"}).get_text()
        content = bsObj.find("div", {"id": "primarycitation"}).find("h4").get_text()
        paper = bsObj.find("li", {"id": "pubmedDOI"}).find_all("a", href=re.compile("^(http://dx.doi.org/)((?!:).)*$"))
        paper = paper[random.randint(0, len(paper) - 1)].attrs["href"]
        print(title, content, paper)
        store(title, content, test)  # store the full page URL (newArticle was not defined in this scope)
    except:
        PrintException()

def get_links(url):
    # the links on rcsb are made with js,
    # so the approach needs to be different
    # create an HTML Session object
    session = HTMLSession()

    # Use the object above to connect to needed webpage
    resp = session.get(url)

    # Run JavaScript code on webpage
    # Running resp.html will give us an object that allows us to print out,
    # search through, and perform several functions on the webpage's HTML
    resp.html.render()

    # To simulate running the JavaScript code,
    # we use the render method on the resp.html object.
    # Note how we don't need to set a variable equal to this rendered result
    resp.html.html

    # So now resp.html.html contains the HTML we need, including the search result links.
    # From here, we can parse out the /structure/ links using the find method.

    soup = BeautifulSoup(resp.html.html, "lxml")
    with open("output1.html", "w", encoding='utf-8') as file:
        file.write(str(soup))
    links = soup.find("ul", {"id": "SearchResultsDetails-MainContent"}).find_all("a", href=re.compile("^(/structure/)((?!:).)*$"))
    PDBs = []
    try:
        for x in range(len(links)):
            newArticle = links[x].attrs["href"]
            # here you will get each link,
            # which points to a particular crystal structure
            print(newArticle)
            PDBs.append(newArticle)
    except:
        PrintException()
        pass
    # np.unique drops links that appear more than once in the result list
    PDBs = np.unique(PDBs)
    for y in range(len(PDBs)):
        print(PDBs[y])
        get_pdb_page(PDBs[y])

def get_Search(word):
    url = 'http://www.rcsb.org/'
    browser = webdriver.Chrome(ChromeDriverManager().install())
    browser.get(url)
    browser.implicitly_wait(10)  # this lets webdriver wait 10 seconds for the website to load

    search_bar = '//*[@id="autosearch_SearchBar"]'
    search_bar = WebDriverWait(browser, 30).until(EC.visibility_of_element_located((By.XPATH, search_bar)))
    search_bar.send_keys(word)

    browser.implicitly_wait(10)
    btn_go = '//*[@id="searchbutton"]'
    btn_go = WebDriverWait(browser, 30).until(EC.visibility_of_element_located((By.XPATH, btn_go)))
    btn_go.click()

    print(browser.current_url)
    # get_links(browser.current_url)
    while True:
        try:
            browser.implicitly_wait(30)
            btn_next = '//*[@id="toppager"]/div/button[2]'
            btn_next = WebDriverWait(browser, 30).until(EC.visibility_of_element_located((By.XPATH, btn_next)))
            btn_next.click()
            print(browser.current_url)
            try:
                get_links(browser.current_url)
            except:
                PrintException()
                pass
        except:
            PrintException()
            break  # no "next" button left: stop paging instead of looping forever

# Start the Script

text_query = "adenosine"
# get_Search(text_query)

cur.execute("SELECT * FROM pdb.pages ORDER BY title;")
print(cur.fetchone())
# This query returns titles that appear more than once in the pages table:
cur.execute("SELECT title, COUNT(title) FROM pdb.pages GROUP BY title HAVING COUNT(title) > 1;")
# The following query deletes the duplicate rows, keeping the row with the highest id for each title:
cur.execute("DELETE t1 FROM pdb.pages t1 INNER JOIN pdb.pages t2 WHERE t1.id < t2.id AND t1.title = t2.title;")
# The following queries renumber the id column starting from 1:
cur.execute("SET @i=0;")
cur.execute("UPDATE pdb.pages SET id=(@i:=@i+1);")
connection.commit()
print(cur.fetchone())
@@ -0,0 +1,31 @@
import pymysql


class DBHelper:

    def __init__(self):
        self.host = "127.0.0.1"
        self.user = "root"
        self.port = int(5000)
        self.password = "Br_4912#862"
        self.db = "mysql"
        self.charset = 'utf8'

    def __connect__(self):
        self.con = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password, db=self.db,
                                   cursorclass=pymysql.cursors.DictCursor, charset=self.charset)
        self.cur = self.con.cursor()

    def __disconnect__(self):
        self.con.close()

    def fetch(self, sql):
        self.__connect__()
        self.cur.execute(sql)
        result = self.cur.fetchall()
        self.__disconnect__()
        return result

    def execute(self, sql):
        self.__connect__()
        self.cur.execute(sql)
        self.con.commit()  # commit so INSERT/UPDATE/DELETE statements actually persist
        self.__disconnect__()
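DBHelper is imported by the scraper (from SQLConnection import DBHelper) but never called there. A minimal usage sketch, with the query text purely illustrative:

# Sketch only: each call opens a connection, runs the SQL and closes it again
helper = DBHelper()
helper.execute("INSERT INTO pdb.pages (title, content, url) VALUES ('t', 'c', 'u');")
rows = helper.fetch("SELECT title, url FROM pdb.pages LIMIT 5;")
for row in rows:               # DictCursor returns every row as a dict
    print(row["title"], row["url"])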
@@ -0,0 +1,31 @@
-- Create a new database called 'pdb' (MySQL syntax)
-- Create the new database if it does not exist already
CREATE DATABASE IF NOT EXISTS pdb;
USE pdb;

CREATE TABLE pages (
    id BIGINT(7) NOT NULL AUTO_INCREMENT,
    title VARCHAR(200),
    content VARCHAR(10000),
    url VARCHAR(255) NOT NULL,
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);

INSERT INTO pages (title, content, url) VALUES ("Test page title", "this is some test page content. It can be up to 10,000 characters long", "https://github.com/BruzzeseAgustin");
ALTER DATABASE pdb CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
ALTER TABLE pages CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
ALTER TABLE pages CHANGE title title VARCHAR(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
ALTER TABLE pages CHANGE content content VARCHAR(10000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
ALTER TABLE pages CHANGE url url VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

DESCRIBE pages;
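One way to apply this schema before running the scraper is to feed the file to PyMySQL statement by statement. This is a sketch under two assumptions: the script is saved as DB.sql (the diff view hides file names), and splitting on ';' is good enough because none of these statements contain embedded semicolons:

# Sketch only: run the schema file with PyMySQL (credentials copied from the scraper)
import pymysql

conn = pymysql.connect(host='localhost', port=5000, user='root', password='Br_4912#862', charset='utf8')
with open("DB.sql", encoding="utf-8") as f:          # assumed file name
    statements = [s.strip() for s in f.read().split(";") if s.strip()]
with conn.cursor() as cur:
    for stmt in statements:
        cur.execute(stmt)
conn.commit()
conn.close()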