Python 編碼格式檢測可以使用 chardet,例如:
import urllib

import chardet

# Fetch the raw (undecoded) bytes of the page and let chardet guess the
# encoding they are written in.
rawdata = urllib.urlopen('http://www.google.cn/').read()
print(chardet.detect(rawdata))
# Output:
# {'confidence': 0.98999999999999999, 'encoding': 'GB2312'}
# On Windows, command-line arguments are GBK-encoded:
#   star.gbk2unicode(sys.argv[1]) + u'也有'
def gbk2unicode(s):
    """Decode the GBK-encoded byte string ``s`` into a unicode string.

    Bytes that are not valid GBK are dropped (``'ignore'`` error handler)
    instead of raising ``UnicodeDecodeError``.
    """
    return s.decode('gbk', 'ignore')


# When the script file declares "#coding:utf-8", string literals without the
# u prefix are UTF-8 byte strings by default:
#   star.utf82unicode('我')
def utf82unicode(s):
    """Decode the UTF-8 encoded byte string ``s`` into a unicode string.

    Bytes that are not valid UTF-8 are dropped (``'ignore'`` error handler)
    instead of raising ``UnicodeDecodeError``.
    """
    return s.decode('utf-8', 'ignore')


# A string literal with the u prefix is a unicode string:
# star.unicode2gbk(u'\u4e5f\u6709')
# star.unicode2gbk(u'也有')
def unicode2gbk(s):
    """Encode the unicode string ``s`` as a GBK byte string.

    No error handler is passed, so the codec's default strict mode applies:
    a character that GBK cannot represent raises ``UnicodeEncodeError``.
    """
    return s.encode('gbk')


# A string literal with the u prefix is a unicode string:
# star.unicode2utf8(u'\u4e5f\u6709')
# star.unicode2utf8(u'也有')
def unicode2utf8(s):
    """Encode the unicode string ``s`` as a UTF-8 byte string."""
    return s.encode('utf-8')


# On Windows, command-line arguments are GBK-encoded:
#   star.gbk2utf8(sys.argv[1]) + '也有'
def gbk2utf8(s):
    """Re-encode the GBK byte string ``s`` as a UTF-8 byte string.

    Bytes that are not valid GBK are dropped (``'ignore'`` error handler)
    during the decoding step.
    """
    return s.decode('gbk', 'ignore').encode('utf-8')


def utf82gbk(s):
    """Re-encode the UTF-8 byte string ``s`` as a GBK byte string.

    Bytes that are not valid UTF-8 are dropped (``'ignore'`` error handler)
    during the decoding step. The GBK encoding step passes no error handler,
    so a decoded character that GBK cannot represent raises
    ``UnicodeEncodeError``.
    """
    return s.decode('utf-8', 'ignore').encode('gbk')