CME1.py
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 20 16:23:53 2018
"""
#scraping CME is soooo effortless
#just simple html parse tree
#how i love Chicago
import urllib.request as u
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
os.chdir('H:/')
#
def scrape(category_name,commodity_name):
    #i use a proxy handler because my uni network runs on its own proxy
    #and i cannot authenticate python through that proxy
    #so i pass an empty proxy to bypass the authentication
    proxy_handler = u.ProxyHandler({})
    opener = u.build_opener(proxy_handler)
    #cme officially forbids scraping
    #so a user-agent header must be used to disguise the request as a browser
    #the developers say no to scraping, or so it appears
    #but in practice they turn a blind eye to us, thx
    #i need different types of commodity
    #so the url is formatted for each commodity
    req=u.Request('http://www.cmegroup.com/trading/metals/%s/%s.html'%(
        category_name,commodity_name),headers={'User-Agent': 'Mozilla/5.0'})
    response=opener.open(req)
    result=response.read()
    soup=bs(result,'html.parser')
    return soup
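#quick sanity check for the scraper (a minimal sketch, not part of the original flow;
#it assumes the cme metals quote pages still follow the
#/trading/metals/<category>/<commodity>.html pattern used above):
#    soup=scrape('precious','gold')
#    print(soup.title)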
#
def etl(category_name,commodity_name):
    try:
        page=scrape(category_name,commodity_name)
        print(commodity_name)
    except Exception as e:
        print(e)
        return None
    #i need date, prior settle price and volume
    #it is essential to view the page source of the website first
    #then use beautiful soup to search for the specific class
    p1=page.find_all('span',class_='cmeNoWrap')
    p2=page.find_all('td',class_=['statusOK','statusNull','statusAlert'])
    p3=page.find_all('td',class_="cmeTableRight")
    a=[]
    b=[]
    c=[]
    for i in p1:
        a.append(i.text)
    #somehow prior settle is hard to get
    #we cannot find that specific tag directly
    #so we search for the previous tag instead
    #the find_next function of beautifulsoup returns the tag that follows
    #the tag preceding prior settle is change
    for j in p2:
        temp=j.find_next()
        b.append(temp.text)
    #the volume figures contain commas which must be stripped before conversion
    for k in p3:
        c.append(float(k.text.replace(',','')))
    df=pd.DataFrame()
    df['expiration date']=a
    df['prior settle']=b
    df['volume']=c
    df['name']=commodity_name
    #for me, i wanna highlight the front month
    #the front month is the contract where the majority of volume and liquidity occurs
    df['front month']=df['volume']==max(df['volume'])
    return df
#
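#a single-commodity run for illustration (a minimal sketch; 'precious' and 'gold'
#are the same category/commodity names used in main() below):
#    gold=etl('precious','gold')
#    print(gold[gold['front month']])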
def main():
    #scraping and etl
    df1=etl('precious','silver')
    df2=etl('precious','gold')
    df3=etl('precious','palladium')
    df4=etl('base','copper')
    #concatenate then export
    dd=pd.concat([df1,df2,df3,df4])
    dd.to_csv('cme.csv',encoding='utf_8_sig')

if __name__ == "__main__":
    main()
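#utf_8_sig prepends a byte order mark so Excel detects the encoding correctly
#to load the export back for a quick look (a minimal sketch, assuming cme.csv
#sits in the working directory set by os.chdir above):
#    check=pd.read_csv('cme.csv',index_col=0)
#    print(check.groupby('name')['volume'].sum())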