程式人生 > 語料處理之全形轉半形

語料處理之全形轉半形

該指令碼的功能是把 UTF-8 編碼文字檔案中的全形字元轉換為半形字元:全形空格(U+3000)轉為半形空格(U+0020),其餘全形字元(U+FF01–U+FF5E)的碼位減去 65248(0xFEE0)即得對應的半形字元,其他字元保持不變:

# -*- coding: utf-8 -*-  

def strQ2B(inputFilePath, outputFilePath):
    """Convert full-width characters in a UTF-8 text file to half-width.

    Reads ``inputFilePath`` as UTF-8, maps the ideographic space (U+3000) to
    an ASCII space (U+0020) and every full-width form in U+FF01..U+FF5E to
    its ASCII counterpart (code point minus 65248), then writes the result
    to ``outputFilePath`` as UTF-8. All other characters, including line
    endings, are copied unchanged.

    Args:
        inputFilePath: Path of the UTF-8 encoded text file to read.
        outputFilePath: Path of the file to create or overwrite.

    Raises:
        IOError / OSError: If either file cannot be opened.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Function-scope import keeps the block self-contained. io.open takes an
    # explicit encoding on both Python 2.7 and Python 3, so text is decoded
    # and encoded once at the I/O boundary instead of once per line.
    import io

    # Translation table keyed by code point: full-width forms (except the
    # space) sit at a fixed offset of 65248 from their ASCII equivalents.
    table = {code: code - 65248 for code in range(65281, 65375)}
    # The full-width space does not follow the offset rule; map it directly.
    table[12288] = 32

    # Open the input first so a missing input does not truncate the output,
    # and use context managers so both handles close even if decoding fails.
    # newline='' disables newline translation so line endings pass through.
    with io.open(inputFilePath, encoding='utf-8', newline='') as inputFile:
        with io.open(outputFilePath, 'w', encoding='utf-8',
                     newline='') as outputFile:
            # Stream line by line rather than loading the whole file.
            for line in inputFile:
                outputFile.write(line.translate(table))
        
if __name__ == "__main__":
    # Script entry point: convert the sample file one directory up and
    # write the half-width result next to it.
    strQ2B("../1.txt", "../2.txt")