-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert_file_to_utf-8.py
54 lines (48 loc) · 2.23 KB
/
convert_file_to_utf-8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Ficapy
# Create: '15/7/16'
# FUCK YOU GB2312
import codecs
import os
import shutil
from chardet.universaldetector import UniversalDetector
from os import path
# Name of the directory, created inside the scanned tree, that receives the
# converted copies.
OUTPUT_DIRNAME = 'utf-8'
# http://stackoverflow.com/questions/191359/how-to-convert-a-file-to-utf-8-in-python
BLOCKSIZE = 1048576  # or some other, desired size in bytes


def detect_encoding(filepath, detector):
    """Feed *filepath* to *detector* and return ``(encoding, confidence)``.

    *detector* is reset first, fed the file line by line (in binary mode)
    until it reports ``done`` or the file ends, and then closed.  Both
    returned values come from ``detector.result`` and may be ``None``.
    """
    detector.reset()
    # The context manager closes the handle even when we stop reading early.
    with open(filepath, 'rb') as source:
        for line in source:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    result = detector.result
    return result.get('encoding'), result.get('confidence')


def transcode_to_utf8(filepath, dst, encoding, blocksize=BLOCKSIZE):
    """Decode *filepath* using *encoding* and write it to *dst* as UTF-8.

    The data is streamed in chunks of roughly *blocksize*.  Raises
    ``UnicodeError`` when the content does not decode with *encoding* and
    ``LookupError`` when Python has no codec of that name; *dst* may have
    been partially written in that case.
    """
    # An earlier approach shelled out to iconv, but the iconv bundled with
    # OS X does not support the -o option, so the conversion is done here.
    with codecs.open(filepath, 'r', encoding) as source_file:
        with codecs.open(dst, 'w', 'utf-8') as target_file:
            while True:
                contents = source_file.read(blocksize)
                if not contents:
                    break
                target_file.write(contents)


def convert_tree(top, detector):
    """Mirror every file under *top* into ``top/utf-8``, transcoded to UTF-8.

    For each file the encoding is guessed with *detector*.  If a guess is
    available the file is transcoded; if transcoding fails, or no guess is
    available, the file is copied unchanged.  A transcoding failure is
    reported on stdout together with the guessed encoding and confidence.
    """
    out_root = path.join(top, OUTPUT_DIRNAME)
    for root, dirnames, filenames in os.walk(top):
        # Prune our own output directory (in place, so os.walk honours it);
        # otherwise a second run would convert the previous output again
        # and nest it as utf-8/utf-8/...
        if root == top and OUTPUT_DIRNAME in dirnames:
            dirnames.remove(OUTPUT_DIRNAME)
        if not filenames:
            continue
        # Build the destination from the path relative to *top* instead of a
        # substring replace, which would rewrite every occurrence of *top*
        # inside the path rather than just the leading prefix.
        utf8path = path.normpath(path.join(out_root, path.relpath(root, top)))
        # Python 3 only (exist_ok); Python 2 is not supported.
        os.makedirs(utf8path, exist_ok=True)
        for name in filenames:
            filepath = path.join(root, name)
            dst = path.join(utf8path, name)
            encode, confidence = detect_encoding(filepath, detector)
            if not encode:
                shutil.copyfile(filepath, dst)
                continue
            try:
                transcode_to_utf8(filepath, dst, encode)
            except (UnicodeError, LookupError):
                # Wrong guess or unknown codec: keep the original bytes.
                shutil.copyfile(filepath, dst)
                print('{file}文件转换错误,识别编码为{encode},确认度为{confidence}'.format(
                    file=filepath,
                    encode=encode,
                    confidence=confidence,
                ))


def main():
    """Convert the directory tree that contains this script."""
    top = path.dirname(path.abspath(__file__))
    convert_tree(top, UniversalDetector())


if __name__ == '__main__':
    main()