-
Notifications
You must be signed in to change notification settings - Fork 1
/
classify.py
93 lines (86 loc) · 2.54 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import pandas as pd
import re
import numpy
# The sixteen MBTI type codes. A code's position in this list is the index of
# its bucket in ``postdata``.
m = [
    'ISTJ', 'ISFJ', 'INFJ', 'INTJ',
    'ISTP', 'ISFP', 'INFP', 'INTP',
    'ESTP', 'ESFP', 'ENFP', 'ENTP',
    'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ',
]
# One independent, initially empty bucket per type code in ``m``.
postdata = [[] for _ in m]
def getFile(fileName='./mbti_1.csv'):
    """Read the MBTI dataset and return its rows as ``(type, posts)`` pairs.

    Args:
        fileName: CSV source handed straight to ``pandas.read_csv`` (a path
            or an open file-like object). The data must have ``type`` and
            ``posts`` columns. Defaults to ``'./mbti_1.csv'``, so a plain
            ``getFile()`` call reads the same file as before.

    Returns:
        A list of ``(type, posts)`` tuples, one per CSV row, in file order.
        The list is empty when the CSV holds only the header row.

    Raises:
        AttributeError: if the ``type`` or ``posts`` column is missing.
        Any error raised by ``pandas.read_csv`` (for example
        ``FileNotFoundError``) propagates unchanged.
    """
    filedata = pd.read_csv(fileName)
    # Pair the two columns row by row; zip keeps the CSV row order.
    return list(zip(filedata.type, filedata.posts))
# Load every (type, posts) row of the dataset.
d = getFile()

# Group the rows by MBTI type: each row's posts string is appended to the
# bucket in ``postdata`` that sits at the same index as its type code in ``m``.
# ``m.index`` raises ValueError for a type code that is not listed in ``m``.
for mbti_type, posts in d:
    postdata[m.index(mbti_type)].append(posts)

# Merge each bucket into one string, with '|||' between the entries, so that
# ``tempdata`` holds exactly one string per type code (same order as ``m``).
tempdata = ['|||'.join(bucket) for bucket in postdata]
# Noise patterns that are replaced by a single space, applied in this order.
# They are raw strings, so regex escapes such as \w and \d are not read as
# (invalid) Python string escapes, and they are compiled once up front instead
# of being re-parsed for every text.
_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # URL scheme plus host. Percent-escapes (%XX) are matched by a consuming
    # group; a lookahead in that position would never consume the escape.
    # NOTE(review): the `$uhttp` alternative can never match, because an
    # unescaped `$` is an end-of-string anchor -- confirm whether `\$uhttp`
    # was intended.
    r'(http|ftp|https|uhttp|$uhttp)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',
    # http URLs that include a path, followed by a space / by any character.
    # NOTE(review): the pattern above has already removed `http://<host>`, so
    # these two can practically only fire on leftovers such as `http:///...`;
    # reorder them if stripping whole URLs was the intent.
    r'http://([\d\w\./])+/[\d\w\./]+ ',
    r'http://([\d\w\./])+/[\d\w\./]+.',
    # Placeholder of the form "http : **<digits and semicolons>;TOOLONG".
    r'http : \*\*([\d;])+;TOOLONG',
    # HTML tags.
    r'<[^>]+>',
    # E-mail addresses (lower-case only, as written).
    r'([a-z0-9_.+-]+@[a-z0-9-]+\.[a-z0-9-.]+)',
    # File names with an image/pdf extension. The dot before the extension is
    # escaped: as a bare `.` it matched any character, so ordinary text such
    # as "the gift" was cut down to " t".
    r'[\d\w\./]+\.(jpg|jpeg|png|pdf|gif|bmp)',
    # Collapse the runs of spaces left behind by the substitutions above.
    r' +',
))


def _clean_post(post):
    """Return *post* with every noise pattern replaced by a space, trimmed."""
    for pattern in _NOISE_PATTERNS:
        post = pattern.sub(' ', post)
    return post.strip()


# For every per-type string: split it into individual posts on '|||', clean
# each post, and keep only the posts that are non-empty after cleaning.
# Splitting happens BEFORE cleaning so that a single match (for example the
# tag pattern starting at a "<3" and ending at a ">" many posts later) cannot
# swallow the '|||' separators and the posts between them.
newdata = []
for text in tempdata:
    cleaned_posts = (_clean_post(post) for post in text.split('|||'))
    newdata.append([post for post in cleaned_posts if post])
# Map every MBTI type code to its list of cleaned posts. The labels come from
# ``m`` (whose order is the bucket order used to build ``newdata``) instead of
# a second hand-maintained 16-entry literal that would have to be kept in sync.
# Indexing ``newdata`` directly keeps the IndexError if an entry is missing.
data = {label: newdata[slot] for slot, label in enumerate(m)}

# The per-type lists differ in length. Wrapping each one in a Series lets
# pandas align the columns on the row index and fill the shorter ones with
# NaN; a dict of plain unequal-length lists would be rejected.
dd = pd.DataFrame({label: pd.Series(posts) for label, posts in data.items()})

# Write one column per type (plus the default index column) as CSV.
dd.to_csv('./data-clean2.csv', sep=',')