1. 程式人生 > >統計機器學習標註圖片中各個類別的樣本量以及檢查特殊樣本數量

統計機器學習標註圖片中各個類別的樣本量以及檢查特殊樣本數量

       在進行機器學習或深度學習時,對於邊訓練邊增加圖片樣本的情況,我們經常需要獲知目前資料集中各類別樣本的分佈,並處理一些特殊情況(比如標註框面積小於指定閾值的標註等)。為此寫了個簡單程式,方便後面使用,特記錄於此。由於程式簡明扼要,有些 Python 基礎的童鞋都能看得懂,在此不多說。具體見如下程式碼:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#2018/09/12 by DQ
import os
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

# Side length (pixels) from which the small-box area threshold is derived;
# also echoed in the printed report.
BoxLenTol = 30
# Area threshold: a box whose area is strictly below this counts as "small".
BoxAreaTol = BoxLenTol * BoxLenTol
# Not referenced in the visible code; presumably image width x height -- confirm.
ImSize = [640, 480]
# Number of digits in a zero-padded annotation/image id (e.g. 000001).
fileIdLen = 6
# Image file extension (not referenced in the visible code).
ImExpName = '.jpg'
# Annotation file extension.
AnotExpName = '.xml'
# Class names that get a counter; the counting helpers ignore any other name.
ClsNameSet = ('blis', 'cosd', 'nake', 'break')
# Folder holding the annotation XML files.
AnotFolder = '/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/Annotations'
# Folder holding the train/val/test id-list files.
TrainValTestAssignFolder = '/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/ImageSets/Main'
# Split name -> id-list file name inside TrainValTestAssignFolder.
TrainValTestFiles = {
    'train': 'train.txt',
    'val': 'val.txt',
    'test': 'test.txt',
}

##get object annotation bndbox loc start 
def GetAnnotBoxLoc(AnotPath):
    """Parse one annotation XML file and group its bounding boxes by class.

    Every ``<object>`` element directly under the document root contributes
    one box, read from the ``xmin``/``ymin``/``xmax``/``ymax`` children of its
    ``<bndbox>`` element. Each coordinate is parsed as an integer and
    decremented by one (presumably converting 1-based annotation coordinates
    to 0-based pixel indices -- confirm against the labelling tool).

    Args:
        AnotPath: path of the XML annotation file.

    Returns:
        A dict mapping each object name to a list of ``[x1, y1, x2, y2]``
        boxes, in document order. Empty when the file has no ``<object>``.

    Raises:
        AttributeError: an ``<object>`` lacks ``<name>``, ``<bndbox>`` or one
            of the four coordinate elements.
        ValueError: a coordinate is not an integer literal.
    """
    root = ET.ElementTree(file=AnotPath).getroot()
    ObjBndBoxSet = {}
    for Object in root.findall('object'):
        ObjName = Object.find('name').text
        BndBox = Object.find('bndbox')
        BndBoxLoc = [int(BndBox.find(Tag).text) - 1
                     for Tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        # setdefault replaces the Python-2-only has_key()/else branching.
        ObjBndBoxSet.setdefault(ObjName, []).append(BndBoxLoc)
    return ObjBndBoxSet
##get object annotation bndbox loc end


def CalSampleNum(BoxSet, BoxNumSet):
    """Add the per-class box counts of one annotation to running totals.

    Args:
        BoxSet: dict mapping class name -> list of boxes.
        BoxNumSet: dict mapping class name -> running count; updated in place.
            Classes in ``BoxSet`` that are not already keys here are ignored,
            so the set of tracked classes never grows.

    Returns:
        None. The result is the mutation of ``BoxNumSet``.
    """
    # items() and `in` work on both Python 2 and 3, unlike iteritems()/has_key().
    for Key, Val in BoxSet.items():
        if Key in BoxNumSet:
            BoxNumSet[Key] += len(Val)

# Count the labelled boxes whose area is small. (Author's note: this data was
# labelled by hand, so such boxes may exist; a standard dataset should have none.)
def CalSmallAreaSampleNum(BoxSet, SmallBoxNumSet, AreaTol=None):
    """Add the number of small-area boxes per class to running totals.

    A box ``[x1, y1, x2, y2]`` is small when ``(x2 - x1) * (y2 - y1)`` is
    strictly less than the threshold.

    Args:
        BoxSet: dict mapping class name -> list of ``[x1, y1, x2, y2]`` boxes.
        SmallBoxNumSet: dict mapping class name -> running count of small
            boxes; updated in place. Classes that are not already keys here
            are ignored.
        AreaTol: area threshold. When omitted (``None``) the module-level
            ``BoxAreaTol`` is used, which is what the two-argument call did
            before this parameter existed.

    Returns:
        None. The result is the mutation of ``SmallBoxNumSet``.
    """
    if AreaTol is None:
        AreaTol = BoxAreaTol
    # items() and `in` work on both Python 2 and 3, unlike iteritems()/has_key().
    for Key, Val in BoxSet.items():
        if Key not in SmallBoxNumSet:
            continue
        for Box in Val:
            X1, Y1, X2, Y2 = Box[:4]
            if (X2 - X1) * (Y2 - Y1) < AreaTol:
                SmallBoxNumSet[Key] += 1



############################################ 
def GetTotalSampleNum():
    """Print per-class box statistics over the annotation files in AnotFolder.

    For every class in ``ClsNameSet`` the report gives the total number of
    boxes, the mean number of boxes per annotation file (rounded to two
    decimals), the number of small-area boxes (as counted by
    ``CalSmallAreaSampleNum``) and the remaining "big" boxes
    (total minus small).

    Only directory entries ending in ``AnotExpName`` are read, and they are
    read by their real names. The earlier approach counted every directory
    entry and assumed files were named 000001..N without gaps, so a stray
    file or a missing id raised an I/O error. For a folder that holds exactly
    those consecutively numbered files the output is unchanged.

    Returns:
        None. The statistics are written to standard output.
    """
    AnotNames = sorted(Name for Name in os.listdir(AnotFolder)
                       if Name.endswith(AnotExpName))
    AnotFileNum = len(AnotNames)

    TotalSampleNum = dict.fromkeys(ClsNameSet, 0)
    SmallBoxNumSet = dict.fromkeys(ClsNameSet, 0)
    MeanSampleNum = dict.fromkeys(ClsNameSet, 0)
    BigAreaSampleNum = dict.fromkeys(ClsNameSet, 0)

    for AnotName in AnotNames:
        AnotBoxSet = GetAnnotBoxLoc(os.path.join(AnotFolder, AnotName))
        CalSampleNum(AnotBoxSet, TotalSampleNum)
        CalSmallAreaSampleNum(AnotBoxSet, SmallBoxNumSet)

    for Key in ClsNameSet:
        # With no annotation files the mean stays 0 instead of dividing by zero.
        if AnotFileNum:
            MeanSampleNum[Key] = round(TotalSampleNum[Key] * 1.0 / AnotFileNum, 2)
        BigAreaSampleNum[Key] = TotalSampleNum[Key] - SmallBoxNumSet[Key]

    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('ImNum=' + str(AnotFileNum))
    print('TotalSampleNum=' + str(TotalSampleNum))
    print('MeanSampleNum=' + str(MeanSampleNum))
    print('BoxAreaTol=' + str(BoxLenTol) + '*' + str(BoxLenTol))
    print('SmallAreaSampleNum=' + str(SmallBoxNumSet))
    print('BigAreaSampleNum=' + str(BigAreaSampleNum))



def GetTrainValTestSample(SampleNumSet, ImIdFilePath):
    """Accumulate per-class box counts for the images listed in one split file.

    Each non-blank line of the split file is stripped and treated as an image
    id; the annotation ``<id> + AnotExpName`` inside ``AnotFolder`` is parsed
    and its boxes are added to ``SampleNumSet``. Afterwards the split file's
    base name and the number of ids processed are written to standard output
    without a trailing newline, so the caller's next print continues the line.

    Args:
        SampleNumSet: dict mapping class name -> running count; updated in
            place through ``CalSampleNum``.
        ImIdFilePath: path of the text file listing one image id per line.

    Returns:
        None.
    """
    # Local import keeps this block self-contained; the module does not
    # import sys at file level.
    import sys

    k = 0
    with open(ImIdFilePath, 'r') as FId:
        # Stream the file line by line instead of loading it with readlines().
        for LineStr in FId:
            PureStr = LineStr.strip()
            if not PureStr:
                # A blank line would resolve to a bare ".xml" path and crash.
                continue
            AnotFilePath = os.path.join(AnotFolder, PureStr + AnotExpName)
            CalSampleNum(GetAnnotBoxLoc(AnotFilePath), SampleNumSet)
            k += 1

    FileName = os.path.basename(ImIdFilePath)
    # Portable stand-in for the Python 2 trailing-comma print: text followed
    # by a single separating space and no newline.
    sys.stdout.write(FileName + ' ImageNum=' + str(k) + '; ')


def GetTrainValTestSampleMain():
    """Print per-class box counts for each split listed in TrainValTestFiles.

    For every split file a fresh zeroed counter over ``ClsNameSet`` is filled
    by ``GetTrainValTestSample`` (which writes the image count first), then
    the counter is printed, labelled with the split file's name without its
    extension.

    Returns:
        None. The statistics are written to standard output.
    """
    # Only the file names are needed; values() works on Python 2 and 3.
    for FileName in TrainValTestFiles.values():
        ImIdFilePath = os.path.join(TrainValTestAssignFolder, FileName)
        SampleNumSet = dict.fromkeys(ClsNameSet, 0)
        GetTrainValTestSample(SampleNumSet, ImIdFilePath)
        # splitext drops the extension whatever its length, where the old
        # slice assumed exactly four characters (".txt").
        SplitName = os.path.splitext(FileName)[0]
        print(SplitName + 'SampleNumSet=' + str(SampleNumSet))


# Script body: print the statistics for the whole annotation folder first,
# then the per-split (train/val/test) statistics. There is no __main__ guard,
# so both reports also run when this file is imported.
GetTotalSampleNum()
GetTrainValTestSampleMain()