-
Notifications
You must be signed in to change notification settings - Fork 0
/
identify_code.py
167 lines (139 loc) · 4.63 KB
/
identify_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# coding=utf-8
__author__ = 'tech.chao'
from PIL import Image
import pytesseract
# Colour tolerance threshold: gen_white_black_points treats a pixel as white
# only when its R, G and B values are all greater than this; otherwise the
# pixel is forced to black.
MAX_RGB_VALUE = 20
# Noise size: reduce_noisy erases (turns white) every connected black region
# that contains at most this many points.
MAX_NOISY_COUNT = 25
# RGBA definition of white
WHITE_COLOR = (255, 255, 255, 255)
# RGBA definition of black
BLACK_COLOR = (0, 0, 0, 255)
def print_char_pic(width, height, s_data):
    """
    Render the point matrix as text: a space for a white point, '*' otherwise.

    Earlier versions built this text and then discarded it (the ``print``
    call was commented out).  The text is now returned so a caller can print
    or log it while debugging; nothing is written to stdout here.

    :param width: number of points per row.
    :param height: number of rows.
    :param s_data: flat, row-major sequence of points; the point at column
        ``x`` of row ``y`` is read from ``s_data[y * width + x]``.
    :return: the rendered picture, one newline-terminated line per row.
    """
    lines = []
    for y in range(height):
        row_start = y * width
        # join() avoids the quadratic cost of growing a string with +=
        lines.append(''.join(
            ' ' if s_data[row_start + x] == WHITE_COLOR else '*'
            for x in range(width)
        ))
    return ''.join(line + '\n' for line in lines)
def gen_white_black_points(image):
    """
    Force every pixel of the image to pure white or pure black.

    A pixel becomes BLACK_COLOR as soon as one of its first three channels
    is not greater than MAX_RGB_VALUE; otherwise it becomes WHITE_COLOR.

    :param image: object whose ``getdata()`` yields indexable pixels with at
        least three channels.
    :return: a new list with one WHITE_COLOR / BLACK_COLOR entry per pixel,
        in the order produced by ``getdata()``.
    """
    return [
        BLACK_COLOR
        if (pixel[0] <= MAX_RGB_VALUE
            or pixel[1] <= MAX_RGB_VALUE
            or pixel[2] <= MAX_RGB_VALUE)
        else WHITE_COLOR
        for pixel in image.getdata()
    ]
def reduce_noisy(width, height, points):
    """
    Scan the points row by row and erase small connected black regions.

    For every black point that has not been visited yet, the connected
    region around it is collected with recursion_scan_black_point.  Regions
    of at most MAX_NOISY_COUNT points are treated as noise and overwritten
    with WHITE_COLOR.  ``points`` is modified in place; nothing is returned.

    :param width: number of points per row.
    :param height: number of rows.
    :param points: flat, row-major mutable list of points.
    """
    # Visit markers, one per point: 0 = not visited yet, 1 = visited.
    visited = [0] * (width * height)

    for position, colour in enumerate(points):
        row, column = divmod(position, width)
        starts_region = visited[position] == 0 and colour == BLACK_COLOR
        visited[position] = 1
        if not starts_region:
            continue

        region = [position]
        recursion_scan_black_point(column, row, width, height, region, visited, points)
        if len(region) <= MAX_NOISY_COUNT:
            # Small enough to count as noise: wipe the whole region.
            for member in region:
                points[member] = WHITE_COLOR
def recursion_scan_black_point(x, y, width, height, tmp_list, flag_list, points):
    """
    Collect the 8-connected black region reachable from the point (x, y).

    Every in-bounds neighbour that is examined gets its entry in
    ``flag_list`` set to 1.  A neighbour that was still unflagged and equals
    BLACK_COLOR is appended to ``tmp_list`` and its own neighbours are then
    examined in turn.  The starting point itself is neither flagged nor
    appended here; the caller is expected to have done that already.

    The traversal uses an explicit stack instead of function recursion.  The
    recursive form needed two interpreter frames per black point, so a large
    connected region could exceed the interpreter's recursion limit.  The
    visiting order is the same as the recursive depth-first scan.

    :param x: column of the starting point.
    :param y: row of the starting point.
    :param width: number of points per row.
    :param height: number of rows.
    :param tmp_list: list that receives the flat index of each black point found.
    :param flag_list: flat list of visit markers (0 = unvisited), updated in place.
    :param points: flat, row-major list of points.
    """
    # Scan order: upper-left, up, upper-right, left, right,
    # lower-left, down, lower-right.
    offsets = (
        (-1, -1), (0, -1), (1, -1),
        (-1, 0), (1, 0),
        (-1, 1), (0, 1), (1, 1),
    )
    # Each frame is (column, row, position in `offsets` of the next neighbour
    # to examine), i.e. the state a recursive call would keep on the stack.
    pending = [(x, y, 0)]
    while pending:
        cur_x, cur_y, next_offset = pending.pop()
        while next_offset < len(offsets):
            new_x = cur_x + offsets[next_offset][0]
            new_y = cur_y + offsets[next_offset][1]
            next_offset += 1
            if not (0 <= new_x < width and 0 <= new_y < height):
                continue
            index = new_x + width * new_y
            is_new_black = flag_list[index] == 0 and points[index] == BLACK_COLOR
            flag_list[index] = 1
            if is_new_black:
                tmp_list.append(index)
                # Suspend the current point and descend into the neighbour
                # first; the current point resumes with its remaining
                # neighbours once the neighbour's frame is exhausted.
                pending.append((cur_x, cur_y, next_offset))
                pending.append((new_x, new_y, 0))
                break
def _inner_recursion(new_x, new_y, width, height, tmp_list, flag_list, points):
    """
    Examine one neighbouring point during the region scan.

    The point is always marked as visited.  If it had not been visited
    before and is BLACK_COLOR, its flat index is appended to ``tmp_list``
    and the scan continues from it via recursion_scan_black_point.
    """
    position = width * new_y + new_x
    unvisited_black = flag_list[position] == 0 and points[position] == BLACK_COLOR
    flag_list[position] = 1
    if unvisited_black:
        tmp_list.append(position)
        recursion_scan_black_point(new_x, new_y, width, height, tmp_list, flag_list, points)
def recognize_url(url):
    """
    Download a captcha image, clean it up and run OCR on the result.

    The image is saved to ``imgs/tmp-img.png``, converted to RGBA, reduced to
    pure black/white points, stripped of small noise regions, written back to
    ``imgs/rebuild.png`` and finally handed to pytesseract.

    :param url: address of the captcha image to download.
    :return: the value returned by ``pytesseract.image_to_string`` for the
        cleaned image.
    """
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Neither the download nor Image.save creates a missing directory, so
    # make sure the hard-coded output directory exists first.
    if not os.path.isdir('imgs'):
        os.makedirs('imgs')

    urlretrieve(url, "imgs/tmp-img.png")
    img = Image.open('imgs/tmp-img.png')
    img = img.convert('RGBA')
    w, h = img.size[0], img.size[1]
    point_list = gen_white_black_points(img)
    print_char_pic(w, h, point_list)
    # reduce_noisy edits point_list in place.
    reduce_noisy(w, h, point_list)
    print_char_pic(w, h, point_list)
    img.putdata(point_list)
    img.save("imgs/rebuild.png")
    return pytesseract.image_to_string(Image.open('imgs/rebuild.png'))
if __name__ == '__main__':
    # Manual run: fetch one captcha and print the recognised text.
    # To try a local file (e.g. 'imgs/captcha-1.jpg') instead, open it with
    # Image.open, convert it to 'RGBA' and apply the same steps that
    # recognize_url performs after its download.
    captcha_url = 'https://www.douban.com/misc/captcha?id=aF6L4Uwgnvg2KSFZLJe6apmr:en&size=s'
    print(recognize_url(captcha_url))