文件中八进制转中文
有的日志文件中的中文竟然是以八进制转义形式保存的 UTF-8 字节,形如:
name:"\344\270\252\344\272\272\350\265\204\346\226\231\345\215\241"
为此写了个 Python 脚本:通过识别成对的双引号,将文本中引号内的八进制转义序列转换成中文。
环境
import codecs
import re
import sys
import urllib.parse

try:
    import chardet
except ImportError:  # optional: only used to guess the input file's encoding
    chardet = None

# A run of at least three octal byte escapes (\000-\377).  Three is the
# original threshold: one three-byte UTF-8 character, as used by CJK text.
# Shorter runs are left untouched.
_OCTAL_RUN = re.compile(r"(?:\\[0-3][0-7]{2}){3,}")


def get_next_two_quota(cur_index: int, content):
    """Locate the next pair of double quotes at or after ``cur_index``.

    Returns:
        ``(open_index, close_index)``.  A missing quote is reported as -1;
        when there is no opening quote both values are -1.
    """
    quota1 = content.find('"', cur_index)
    if quota1 == -1:
        # Without this guard the second find() would start at index 0 and
        # report a quote from the beginning of the text.
        return -1, -1
    quota2 = content.find('"', quota1 + 1)
    return quota1, quota2


def _decode_octal_run(match):
    """Decode one matched run of octal escapes as UTF-8 text."""
    codes = match.group(0).split("\\")[1:]
    # Zero-pad to two hex digits so bytes below 0x10 stay valid %XX escapes.
    quoted = "".join("%{:02x}".format(int(code, 8)) for code in codes)
    return urllib.parse.unquote(quoted, encoding="utf-8", errors="replace")


def change_coding(s):
    """Replace every run of octal byte escapes in ``s`` with decoded text.

    Text around and between the runs is preserved.  Bytes that are not
    valid UTF-8 decode to U+FFFD.  A string without a qualifying run is
    returned unchanged.
    """
    return _OCTAL_RUN.sub(_decode_octal_run, s)


def convert_text(content):
    """Decode octal escapes found between pairs of double quotes.

    Quotes are paired left to right.  Text outside the quotes, and anything
    after an unmatched final quote, is copied through unchanged.
    """
    parts = []
    pos = 0
    while True:
        quota1, quota2 = get_next_two_quota(pos, content)
        if quota1 == -1 or quota2 == -1:
            break
        parts.append(content[pos:quota1 + 1])  # up to and including the opening quote
        parts.append(change_coding(content[quota1 + 1:quota2]))
        parts.append('"')
        pos = quota2 + 1
    parts.append(content[pos:])
    return "".join(parts)


def _guess_encoding(path):
    """Return an encoding usable for ``path``, defaulting to UTF-8."""
    if chardet is None:
        return "utf-8"
    with open(path, "rb") as f:
        guess = chardet.detect(f.read()).get("encoding")
    if not guess:
        return "utf-8"
    try:
        codecs.lookup(guess)
    except LookupError:
        # chardet can name encodings that Python has no codec for.
        return "utf-8"
    return guess


def main(argv=None):
    """Print the converted contents of the file named by the first argument.

    Falls back to the original hard-coded path ``"file"`` when no argument
    is given.
    """
    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else "file"
    with open(path, encoding=_guess_encoding(path), errors="replace") as f:
        content = f.read()
    print(convert_text(content))


if __name__ == "__main__":
    main()