当时不太会用 postgresql 在 windows 下面的 powershell 直接导出为 sql 文件,没看里面的内容,最近需要恢复数据发现内容中中文全是乱码,英文是正常的。
尝试了很多方法后,发现文件是 utf16 编码的`Little-endian UTF-16 Unicode text, with very long lines, with CRLF line terminators`
使用下面的命令可以导出为 gbk 编码,部分中文可以还原,但还是有相当多的字符无法还原
```
iconv -f UTF-16 -t GBK < export.sql > out.sql
```
涉及到相关问题的,只有这个文章有所说明
https://www.cnblogs.com/xyb930826/p/4657462.html
尝试了其中 C 代码,仍然是有相同的问题,有些编码无法转换成功
使用 python 重写了代码,实现后,仍然是部分中文无法解码成功
```python
def read_map():
map_value = {}
try:
with open("UnicodeToGBK.txt", "r") as f:
for line in f:
utf_str, gbk_str = line.strip().split()
utf_id = int(utf_str, 16)
gbk_id = int(gbk_str, 16)
map_value[utf_id] = gbk_id
except IOError:
print("Error reading mapping file!")
return None
return map_value
def convert_utf16_to_gbk1(input_file, output_file):
map_value = read_map()
if map_value is None:
print("Convert Failed!")
return
try:
with open(input_file, "rb") as f_in, open(output_file, "wb") as f_out:
# 跳过 BOM
bom = f_in.read(2)
if bom != b'\xff\xfe':
f_in.seek(0)
while True:
ch = f_in.read(1)
cl = f_in.read(1)
if not ch or not cl:
break
ch = ord(ch)
cl = ord(cl)
if ch > 0x7f and cl == 0x00:
ch2 = ord(f_in.read(1))
cl2 = ord(f_in.read(1))
f_out.write(bytes([ch, ch2]))
elif ch <= 0x7f and cl == 0x00:
f_out.write(bytes([ch]))
else:
utf = cl * 256 + ch
gbk = map_value.get(utf, 0)
f_out.write(bytes([gbk // 256, gbk % 256]))
except IOError:
print("Error processing files!")
return
print("Conversion completed successfully!")
def convert_utf16_to_gbk(input_file, output_file):
map_value = read_map()
if map_value is None:
print("Convert Failed!")
return
try:
with open(input_file, "rb") as f_in, open(output_file, "wb") as f_out:
# 跳过 BOM
bom = f_in.read(2)
if bom != b'\xff\xfe':
f_in.seek(0)
while True:
low_byte = f_in.read(1)
high_byte = f_in.read(1)
if not low_byte or not high_byte:
break
low_byte = ord(low_byte)
high_byte = ord(high_byte)
# 正确处理小端序 UTF-16
utf = (high_byte << 8) | low_byte
if utf in map_value:
gbk = map_value[utf]
f_out.write(bytes([gbk // 256, gbk % 256]))
elif utf <= 0x7f:
# ASCII 字符
f_out.write(bytes([utf]))
else:
# 处理未映射的字符
print(f"Unable to convert UTF-16 character: U+{utf:04X}", chr(utf))
# hex_str = f"\\u{utf:04X}"
# f_out.write(hex_str.encode('ascii'))
# f_out.write(bytes.fromhex("E046E160")) # 或者选择其他替代字符
except IOError:
print("Error processing files!")
return
print("Conversion completed successfully!")
if __name__ == "__main__":
convert_utf16_to_gbk("export.sql", "out.sql")
```
比如 UnicodeToGBK.txt 的对应关系如下
```
90C5 DBA4
90C6 E042
90C7 DBA8
90C8 E043
90C9 E044
90CA BDBC
90CB E045
90CC E046
90CD E047
90CE C0C9
90CF DBA3
90D0 DBA6
90D1 D6A3
```
错误的字节
```
Unable to convert UTF-16 character: U+E046
Unable to convert UTF-16 character: U+E160
Unable to convert UTF-16 character: U+E1E2
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E0A5
Unable to convert UTF-16 character: U+E195
Unable to convert UTF-16 character: U+E218
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E1EC
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E1BC
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E11E
Unable to convert UTF-16 character: U+E1C0
Unable to convert UTF-16 character: U+E57D
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E21A
Unable to convert UTF-16 character: U+E6E7
Unable to convert UTF-16 character: U+E11C
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E11C
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E6E7
```
论坛上技术人员较多,有没有遇到过相关问题,不吝赐教 |
|