程式人生 > Python轉碼&解壓&多程序

Python轉碼&解壓&多程序

Python批量轉換檔案編碼格式

Eclipse中看ANSI編碼的檔案有亂碼,所以希望通過python將相關檔案轉換成utf-8編碼。

源:https://www.cnblogs.com/tsbc/p/4450675.html

 

'''

遍歷資料夾

如果檔名是.cpp .h

    如果原來的編碼不是utf-8,將檔案編碼格式改成utf-8

'''

 

import os,sys

import chardet

 

def convert(filename, out_enc="UTF8", src_enc=None):
    """Re-encode the file *filename* in place to *out_enc*.

    Args:
        filename: Path of the file to rewrite.
        out_enc: Name of the target encoding.
        src_enc: Optional name of the file's current encoding.  When omitted,
            the encoding is guessed from the file content with
            ``chardet.detect``.

    Files whose encoding cannot be determined (for example empty files) and
    files that are already in the target encoding are left untouched.
    Bytes that cannot be decoded with the source encoding are dropped
    (``errors="ignore"``).

    Raises:
        LookupError: if the source or target encoding name is unknown to
            Python's codec registry.

    I/O failures are reported with a short message instead of being raised.
    """
    import codecs  # local import keeps this function self-contained

    try:
        # The context manager closes the handle on every path, including the
        # early returns and any exception.
        with open(filename, 'rb+') as fp:
            content = fp.read()

            if src_enc is not None:
                coding = src_enc
            else:
                coding = chardet.detect(content)['encoding']

            # The detector yields None when it cannot guess; decoding with
            # None would raise, so skip such files.
            if not coding:
                return

            # Compare canonical codec names so spellings such as "UTF8",
            # "utf-8" and "UTF-8" are treated as the same encoding.
            if codecs.lookup(coding).name == codecs.lookup(out_enc).name:
                return

            new_content = content.decode(coding, "ignore").encode(out_enc)
            fp.seek(0)
            fp.write(new_content)
            # The re-encoded data may be shorter than the original; without
            # truncation the tail of the old content would be left behind.
            fp.truncate()
    except IOError:
        print( " error")

 

 

def explore(dir):
    """Walk *dir* recursively and convert every ``.cpp`` / ``.h`` file.

    Args:
        dir: Root directory to traverse.

    Only files whose name *ends* with ``.cpp`` or ``.h`` are passed to
    ``convert``.  A substring test would also pick up unrelated files such as
    ``index.html`` or ``main.cpp.bak``.
    """
    for root, _dirs, files in os.walk(dir):
        for file in files:
            if file.endswith(('.cpp', '.h')):
                convert(os.path.join(root, file))

 

# Root directory that main() hands to explore().
fiePath = r'E:\Code'


def main():
    """Run the conversion over the directory configured in ``fiePath``."""
    root_dir = fiePath
    explore(root_dir)


# Only run the conversion when executed as a script, not on import.
if __name__ == "__main__":
    main()

 

Python解壓

https://www.cnblogs.com/Oliva/p/8824040.html 多執行緒字典破解加密zip

https://www.cnblogs.com/fyqq0403/p/9710420.html 解壓加密的zip

https://www.cnblogs.com/flyhigh1860/p/3884842.html 解壓zip

Python多執行緒&多程序

https://www.cnblogs.com/yeayee/p/4952022.html 基礎介紹

https://www.cnblogs.com/kellyseeme/p/5525017.html 鎖的應用

https://www.cnblogs.com/znicy/p/6234522.html  通過多程序的方式解決了解壓縮的效能問題

https://www.cnblogs.com/xybaby/p/6510941.html  python效能優化,介紹了GIL導致的多執行緒效能問題

https://www.cnblogs.com/SuKiWX/p/8804974.html   python GIL解釋

 

用Python解壓大量壓縮檔案(環境中約有6000個)時遇到瓶頸,解壓過程非常慢。嘗試改用多執行緒解壓,處理時間不但沒有減少,反而增加了(解壓屬於CPU密集型工作,受GIL限制,多執行緒無法真正並行)。參考上述部落格後,改用多程序解壓,以縮短處理時間。

import zipfile

import tarfile

import gzip

import os

from time import ctime

from multiprocessing import Pool

from multiprocessing import cpu_count

 

# Directory that unzipDayFile() and untarDayFile() scan for archives.
dayZipsPath = r'.'

# Directory the first-stage archives are extracted into; the main script
# lists it to find the second-stage zip files handled by unzip().
quarterZipsPath = r'./tmp'

# Password passed as ``pwd`` to ZipFile.extractall() in both stages.
# NOTE(review): the password is hard-coded in source - confirm this is acceptable.
zipPassWord = b'password'

# Directory that unzip() extracts the second-stage zip files into.
mrFilePath = r'./data'

 

def unzipDayFile(src_dir=None, dest_dir=None, pwd=None):
    """Extract every ``.zip`` archive found directly in *src_dir*.

    Args:
        src_dir: Directory to scan; defaults to ``dayZipsPath``.
        dest_dir: Directory to extract into; defaults to ``quarterZipsPath``.
        pwd: Archive password as bytes; defaults to ``zipPassWord``.

    The name of each archive is printed before it is extracted.  The source
    archives are not deleted.
    """
    if src_dir is None:
        src_dir = dayZipsPath
    if dest_dir is None:
        dest_dir = quarterZipsPath
    if pwd is None:
        pwd = zipPassWord

    for file_name in os.listdir(src_dir):
        if os.path.splitext(file_name)[1] != '.zip':
            continue
        print( file_name)
        # os.listdir() yields bare names, so join with the scanned directory;
        # opening the bare name only works when src_dir is the current
        # working directory.
        archive_path = os.path.join(src_dir, file_name)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(archive_path, 'r') as file_zip:
            file_zip.extractall(path=dest_dir, pwd=pwd)

 

def untarDayFile(src_dir=None, dest_dir=None):
    """Extract every ``.tar.gz`` archive found directly in *src_dir*.

    Args:
        src_dir: Directory to scan; defaults to ``dayZipsPath``.
        dest_dir: Directory to extract into; defaults to ``quarterZipsPath``.

    The name of each archive is printed before it is extracted.  The source
    archives are not deleted.
    """
    if src_dir is None:
        src_dir = dayZipsPath
    if dest_dir is None:
        dest_dir = quarterZipsPath

    for file_name in os.listdir(src_dir):
        # Match on the suffix only, so side files such as "x.tar.gz.md5" are
        # not mistaken for archives.
        if not file_name.endswith('.tar.gz'):
            continue
        print( file_name)
        # os.listdir() yields bare names, so join with the scanned directory.
        archive_path = os.path.join(src_dir, file_name)
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts member paths inside the archive;
        # if archives can come from an untrusted source, consider an
        # extraction filter on Python versions that provide one.
        with tarfile.open(archive_path) as file_tar:
            file_tar.extractall(path=dest_dir)

 

def unzip(zipsList, src_dir=None, dest_dir=None, pwd=None):
    """Extract and then delete each ``.zip`` archive named in *zipsList*.

    Args:
        zipsList: Iterable of bare file names located in *src_dir*; entries
            whose extension is not ``.zip`` are skipped.
        src_dir: Directory holding the archives; defaults to
            ``quarterZipsPath``.
        dest_dir: Directory to extract into; defaults to ``mrFilePath``.
        pwd: Archive password as bytes; defaults to ``zipPassWord``.

    An archive is removed only after it has been extracted successfully; if
    extraction raises, the archive is closed and left on disk.
    """
    if src_dir is None:
        src_dir = quarterZipsPath
    if dest_dir is None:
        dest_dir = mrFilePath
    if pwd is None:
        pwd = zipPassWord

    for file_name in zipsList:
        if os.path.splitext(file_name)[1] != '.zip':
            continue
        zipFileName = os.path.join(src_dir, file_name)
        # Close the archive before removing it (required on platforms that
        # refuse to delete open files) and on any extraction error.
        with zipfile.ZipFile(zipFileName, 'r') as file_zip:
            file_zip.extractall(path=dest_dir, pwd=pwd)
        os.remove(zipFileName)

 

if __name__ == '__main__':
    print('Begin:%s' % ctime())

    # Number of CPU cores; used as the number of work chunks below.
    cpuNum = cpu_count()
    print(cpuNum)

    # Stage 1: unpack the outer archives into the staging directory.
    unzipDayFile()
    untarDayFile()

    # Stage 2: unpack the staged zip files in parallel worker processes.
    # The staging directory does not exist when stage 1 extracted nothing.
    if os.path.isdir(quarterZipsPath):
        quarterZipsList = os.listdir(quarterZipsPath)
    else:
        quarterZipsList = []
    zipFileNum = len(quarterZipsList)
    print("total zip files num:%d" % (zipFileNum))
    print("begin unzip:%s" % ctime())

    p = Pool()
    pending = []
    for i in range(cpuNum):
        # Split the list into cpuNum contiguous, nearly equal slices.
        beginPos = i * zipFileNum // cpuNum
        endPos = (i + 1) * zipFileNum // cpuNum
        print("proc %d - %d" % (beginPos, endPos))
        pending.append(
            p.apply_async(unzip, args=(quarterZipsList[beginPos:endPos],))
        )
    print("waiting for unzip quarter mr data ...")
    p.close()
    p.join()

    # apply_async() stores a worker's exception in its result object; unless
    # get() is called, the failure is dropped silently.  Report it and keep
    # going so the other chunks are still accounted for.
    for result in pending:
        try:
            result.get()
        except Exception as exc:
            print("unzip worker failed: %r" % (exc,))

    print("end unzip:%s" % ctime())
    print( "End:%s" % ctime())