-
Notifications
You must be signed in to change notification settings - Fork 173
/
SHFE.py
116 lines (88 loc) · 3.29 KB
/
SHFE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 29 10:48:35 2018
"""
#Shanghai Futures Exchange's daily prices are stored in a dat file
#when you make a query on the website
#the page runs jquery to fetch a json file
#then converts the json to a dat file and renders it in the website table
#the process can be tracked via inspect element
#the logic of getting the dat file is pretty much the same as in cme2
# https://github.com/je-suis-tm/web-scraping/blob/master/CME2.py
#theoretically speaking, we could use the same trick as we did for json
#here, we apply a more general way to process it
#regular expression a.k.a. regex
#regex can work on any sort of text extraction
#when we cannot extract text from the html parse tree
#or maybe we just need a part of the text
#regex is the most efficient way
#even for a simple html parse tree
#we can still convert response.content to a string first
#and apply regex to extract what we need later
#regex in python is largely the same as regex in any other language
#the rules of regex syntax are basically universal
#check the link below to see more details of regex
# https://www.w3schools.com/python/python_regex.asp
import requests
import pandas as pd
import re
import datetime as dt
import os
#switch the working directory at import time
#main() writes 'murex update.csv' with a relative path, so the csv lands here
#NOTE(review): 'H:/' is a hard-coded drive path and os.chdir raises OSError
#when it does not exist - confirm the path before running on another machine
os.chdir('H:/')
#this function formats the date used in the SHFE url
#the date format of SHFE is yyyymmdd
def format_date(today=None):
    """Return the day before ``today`` formatted as ``yyyymmdd``.

    Parameters
    ----------
    today : datetime.date or datetime.datetime, optional
        Reference day. Defaults to ``datetime.datetime.now()``.

    Returns
    -------
    str
        The previous calendar day as an 8-digit ``yyyymmdd`` string.
        No weekend or holiday adjustment is made.
    """
    #read the clock once so year, month and day always belong to the same instant
    if today is None:
        today = dt.datetime.now()

    #i normally get t-1 prices
    #subtracting a timedelta rolls over month and year boundaries correctly
    #plain `day - 1` yields day 0 on the 1st of a month, which is not a valid date
    previous_day = today - dt.timedelta(days=1)

    return previous_day.strftime('%Y%m%d')
#
def scrape(date, timeout=30):
    """Download the SHFE daily-price dat file for ``date``.

    Parameters
    ----------
    date : str
        Date in ``yyyymmdd`` form; it is substituted into the file name.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    bytes
        The raw body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    url='http://www.shfe.com.cn/data/dailydata/kx/kx%s.dat'%(date)

    #the context manager closes the session and its pooled connections
    with requests.Session() as session:
        #requests has no default timeout, so an unresponsive server
        #would otherwise block the script forever
        response=session.get(url,timeout=timeout)

    #fail here with a clear http error rather than handing
    #an error page to the parser
    response.raise_for_status()

    return response.content
#
def etl(content):
    """Extract the selected prices from the raw dat payload into a DataFrame.

    Parameters
    ----------
    content : bytes
        Raw dat file body; decoded as utf-8 with an optional BOM.

    Returns
    -------
    pandas.DataFrame
        68 rows and three columns:
        ``0`` holds the ``upload`` list (18 picked prices padded with 50 zeros),
        ``'upload'`` holds the ``group`` list (five 12-price blocks separated
        by two empty strings), and ``'al extra'`` repeats ``al[1]`` on every row.
        Prices are kept as the strings captured by the regex.

    Raises
    ------
    ValueError
        If the payload holds fewer price records than the fixed layout needs.
    """
    #if we look closely at dat file, it is just json in another format
    #all we need to do is to discover the pattern of where the data is stored
    #all the price data i care about are behind colon :
    #regex lookbehind (?<=:) will do the trick
    #the pattern is a raw string because \d and \. are invalid escape
    #sequences in a normal literal and python 3.12+ warns about them
    numbers=re.findall(r'(?<=:)-?\d*\.?\d*',content.decode('utf_8-sig'))

    #i only need the close price, which is the expression of slicing 9::16
    temp=numbers[9::16]

    #the highest position read below is temp[101] via frb[0:2]
    #anything shorter cannot fill the layout, so stop with a clear message
    #instead of an opaque IndexError or a pandas length mismatch
    required=102
    if len(temp)<required:
        raise ValueError(
            'expected at least %d price records, got %d; '
            'the dat file may be empty or missing for this date'
            %(required,len(temp)))

    #and i only need certain types of commodity
    cu=temp[0:12]
    al=temp[13:25]
    zn=temp[26:38]
    pb=temp[39:51]
    ni=temp[52:64]
    au=temp[78:86]
    ag=temp[87:99]
    frb=temp[100:112]

    #customize the format based on my requirement
    #both lists have 68 entries so they fit in the same dataframe
    group=al+['','']+cu+['','']+zn+['','']+pb+['','']+ni
    upload=[al[0]]+cu[0:3]+zn[0:3]+pb[0:3]+frb[0:2]+[ag[2]]+['']+[au[2]]+ni[0:2]+[ni[3]]+[0]*50

    df=pd.DataFrame(upload)
    df['upload']=group
    df['al extra']=al[1]

    return df
#this is the regex to get the date of each contract
#even though price and date are both stored in the same file
#dates are wrapped in quotation marks, prices are not
#i don't need the dates; if you need them, just use the regex below
#
# temp=re.findall(r'(?<=")\d*(?=")',content.decode('utf_8-sig'))
# date=temp[0:12]
#
def main():
    """Run the pipeline: build the date, download the dat file, save the csv.

    The DataFrame returned by etl() is written to 'murex update.csv'
    in the current working directory.
    """
    #feed each stage straight into the next one
    prices=etl(scrape(format_date()))
    prices.to_csv('murex update.csv')


#only run the pipeline when this file is executed as a script
if __name__ == "__main__":
    main()