文件中八进制转中文

有的日志文件中的中文并不是直接可读的文字,而是以八进制转义序列(即 UTF-8 编码的各个字节)的形式记录的,形如:

name:"\344\270\252\344\272\272\350\265\204\346\226\231\345\215\241"

为此写了个 Python 脚本:通过识别成对的双引号,把整个文本中引号内的八进制转义序列转换回中文。

环境

  • Python3

Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import chardet
import re
import sys
import urllib.parse


def get_next_two_quota(cur_index: int, content: str):
    """Return the indexes of the next two double quotes in ``content``.

    The search for the first quote starts at ``cur_index``; the search for
    the second one starts right after the first.

    Args:
        cur_index: Index in ``content`` at which to start searching.
        content: Text to scan.

    Returns:
        A ``(quota1, quota2)`` tuple of indexes. ``quota1`` is -1 when no
        quote exists at or after ``cur_index`` (``quota2`` is then -1 too);
        ``quota2`` is -1 when the first quote has no partner.
    """
    quota1 = content.find('"', cur_index)
    if quota1 == -1:
        # Without this guard the second search would start at index 0
        # (-1 + 1) and could report a quote located *before* cur_index.
        return -1, -1
    quota2 = content.find('"', quota1 + 1)
    return quota1, quota2


# A run of three or more "\ooo" octal escapes, each limited to \000-\377 so
# that every escape is a valid byte value. Three is the minimum because one
# Chinese character occupies three bytes in UTF-8.
_OCTAL_RUN = re.compile(r'(?:\\[0-3][0-7]{2}){3,}')


def _decode_octal_run(match):
    """Decode one matched run of octal escapes as UTF-8 text."""
    # Splitting "\344\270\252" on the backslash yields ['', '344', '270',
    # '252']; the leading empty piece is skipped.
    octal_codes = match.group(0).split('\\')[1:]
    raw = bytes(int(code, 8) for code in octal_codes)
    # Malformed byte sequences become U+FFFD instead of raising.
    return raw.decode('utf-8', errors='replace')


def change_coding(s):
    """Convert octal-escaped UTF-8 byte runs in ``s`` into readable text.

    Every run of at least three consecutive ``\\ooo`` escapes is replaced by
    the text obtained from decoding those bytes as UTF-8. Text around and
    between the runs is kept as is, and shorter runs are left untouched.

    Args:
        s: Text that may contain escapes such as ``\\344\\270\\252``.

    Returns:
        ``s`` with each qualifying run decoded; ``s`` itself is returned
        unchanged when it contains no such run.
    """
    return _OCTAL_RUN.sub(_decode_octal_run, s)


def _convert_quoted(content):
    """Return ``content`` with every double-quoted section decoded.

    Quotes are paired in order of appearance. The text between each pair is
    passed through ``change_coding``; everything else, including the quotes
    themselves, is copied verbatim. Text after the last complete pair (or the
    whole input when it holds fewer than two quotes) is kept unchanged.
    """
    pieces = []
    copied_upto = 0  # index of the first character not yet emitted
    search_from = 0  # index at which the next quote search starts
    while True:
        quota1, quota2 = get_next_two_quota(search_from, content)
        if quota1 == -1 or quota2 == -1:
            break
        # Emit everything up to and including the opening quote, then the
        # decoded quoted text. The closing quote is emitted with the next
        # chunk (or with the tail after the loop).
        pieces.append(content[copied_upto:quota1 + 1])
        pieces.append(change_coding(content[quota1 + 1:quota2]))
        copied_upto = quota2
        search_from = quota2 + 1
    # Always append the real remainder; slicing from a -1 "not found" index
    # would keep only the last character of the input.
    pieces.append(content[copied_upto:])
    return ''.join(pieces)


def main():
    """Read the input file, decode its quoted sections and print the result.

    The path is taken from the first command-line argument when one is
    given; otherwise the file named "file" in the working directory is read.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else "file"
    # NOTE: the file is read with the platform's default text encoding.
    with open(path) as f:
        content = f.read()
    print(_convert_quoted(content))


if __name__ == '__main__':
    main()