統計機器學習標註圖片中各個類別的樣本數以及檢查特殊樣本數量
阿新 • • 發佈:2018-11-01
在進行機器學習或深度學習時,對於那種邊訓練邊增加圖片樣本的情況,我們經常需要獲知目前資料集中樣本的分佈,以及處理特殊情況(比如標註框面積小於指定閾值的標註等)。為此寫了個簡單程式,方便後面使用,特記錄於此。由於程式簡明扼要,有些python基礎的童鞋都能看得懂,在此不多說。具體見如下程式碼:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 2018/09/12 by DQ
"""Count annotated samples per class and report small-area boxes.

The script reads XML annotation files whose <object> elements carry a
<name> and a <bndbox> (xmin/ymin/xmax/ymax), then prints per-class totals,
per-image means, and how many boxes fall below an area threshold.  It also
prints per-class totals for each of the train/val/test image-id lists.

The code runs on both Python 2.7 and Python 3.
"""
import os
import sys

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

BoxLenTol = 30                 # side length the area threshold is derived from
BoxAreaTol = BoxLenTol ** 2    # boxes with an area below this count as "small"
ImSize = [640, 480]            # not referenced by the code below
fileIdLen = 6                  # id width; no longer needed to locate annotation files
ImExpName = '.jpg'             # not referenced by the code below
AnotExpName = '.xml'
ClsNameSet = ('blis', 'cosd', 'nake', 'break')
AnotFolder = '/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/Annotations'
TrainValTestAssignFolder = '/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/ImageSets/Main'
TrainValTestFiles = {'train': 'train.txt', 'val': 'val.txt', 'test': 'test.txt'}


def GetAnnotBoxLoc(AnotPath):
    """Parse one annotation XML file and group its boxes by object name.

    Args:
        AnotPath: path of the annotation XML file.

    Returns:
        dict mapping each object name to a list of ``[x1, y1, x2, y2]``
        boxes, where every coordinate is the XML value minus 1.
    """
    root = ET.parse(AnotPath).getroot()
    ObjBndBoxSet = {}
    for Object in root.findall('object'):
        ObjName = Object.find('name').text
        BndBox = Object.find('bndbox')
        # NOTE(review): the -1 presumably converts 1-based pixel coordinates
        # to 0-based ones -- confirm against the annotation tool.
        BndBoxLoc = [int(BndBox.find(Tag).text) - 1
                     for Tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        ObjBndBoxSet.setdefault(ObjName, []).append(BndBoxLoc)
    return ObjBndBoxSet


def CalSampleNum(BoxSet, BoxNumSet):
    """Add the number of boxes per class in BoxSet to BoxNumSet, in place.

    Classes that are not already keys of BoxNumSet are ignored.
    """
    for Key, Val in BoxSet.items():
        if Key in BoxNumSet:
            BoxNumSet[Key] += len(Val)


def CalSmallAreaSampleNum(BoxSet, SmallBoxNumSet):
    """Count, per class, the boxes whose area is below BoxAreaTol.

    Small boxes may occur in self-labelled data; a standard dataset should
    not contain them.  SmallBoxNumSet is updated in place and classes that
    are not already keys of it are ignored.
    """
    for Key, Val in BoxSet.items():
        if Key not in SmallBoxNumSet:
            continue
        for X1, Y1, X2, Y2 in Val:
            if (X2 - X1) * (Y2 - Y1) < BoxAreaTol:
                SmallBoxNumSet[Key] += 1


def GetTotalSampleNum():
    """Print per-class statistics over every annotation file in AnotFolder.

    Prints the number of annotation files, the per-class totals, the
    per-image means, the area threshold, and the small/large box counts.
    """
    # Enumerate the annotation files that actually exist instead of assuming
    # ids run contiguously from 1 and that the folder holds nothing else.
    AnotNames = sorted(Name for Name in os.listdir(AnotFolder)
                       if Name.endswith(AnotExpName))
    AnotFileNum = len(AnotNames)
    TotalSampleNum = dict.fromkeys(ClsNameSet, 0)
    SmallBoxNumSet = dict.fromkeys(ClsNameSet, 0)

    for AnotName in AnotNames:
        AnotBoxSet = GetAnnotBoxLoc(os.path.join(AnotFolder, AnotName))
        CalSampleNum(AnotBoxSet, TotalSampleNum)
        CalSmallAreaSampleNum(AnotBoxSet, SmallBoxNumSet)

    # An empty folder yields a mean of 0.0 rather than a ZeroDivisionError.
    MeanSampleNum = {
        Key: round(Val * 1.0 / AnotFileNum, 2) if AnotFileNum else 0.0
        for Key, Val in TotalSampleNum.items()
    }
    BigAreaSampleNum = {
        Key: Val - SmallBoxNumSet[Key]
        for Key, Val in TotalSampleNum.items()
    }

    print('ImNum=' + str(AnotFileNum))
    print('TotalSampleNum=' + str(TotalSampleNum))
    print('MeanSampleNum=' + str(MeanSampleNum))
    print('BoxAreaTol=' + str(BoxLenTol) + '*' + str(BoxLenTol))
    print('SmallAreaSampleNum=' + str(SmallBoxNumSet))
    print('BigAreaSampleNum=' + str(BigAreaSampleNum))


def GetTrainValTestSample(SampleNumSet, ImIdFilePath):
    """Accumulate per-class box counts for the image ids listed in a file.

    Args:
        SampleNumSet: dict of class name -> count, updated in place.
        ImIdFilePath: text file with one image id per line; blank lines
            are skipped.

    Writes ``<file name> ImageNum=<n>; `` to stdout without a newline so the
    caller can finish the line.
    """
    k = 0
    with open(ImIdFilePath, 'r') as FId:
        for LineStr in FId:
            PureStr = LineStr.strip()
            if not PureStr:
                continue
            AnotFilePath = os.path.join(AnotFolder, PureStr + AnotExpName)
            CalSampleNum(GetAnnotBoxLoc(AnotFilePath), SampleNumSet)
            k += 1
    FileName = os.path.basename(ImIdFilePath)
    # NOTE(review): the original literal before 'ImageNum=' was split across a
    # line break in the source; a single space is assumed here -- confirm.
    # The trailing space reproduces what a Python 2 "print x," would emit
    # before the next print.
    sys.stdout.write(FileName + ' ImageNum=' + str(k) + '; ')


def GetTrainValTestSampleMain():
    """Print per-class sample counts for each file in TrainValTestFiles."""
    for FileName in TrainValTestFiles.values():
        ImIdFilePath = os.path.join(TrainValTestAssignFolder, FileName)
        SampleNumSet = dict.fromkeys(ClsNameSet, 0)
        GetTrainValTestSample(SampleNumSet, ImIdFilePath)
        # FileName[:-4] drops the 4-character '.txt' suffix.
        print(FileName[:-4] + 'SampleNumSet=' + str(SampleNumSet))


if __name__ == '__main__':
    GetTotalSampleNum()
    GetTrainValTestSampleMain()