
Python Clustering Analysis of Death Row Inmates' Last Statements

Clustering is an example of unsupervised learning: it groups samples by similarity without any labeled training data. Let's get straight to the topic, starting with a note on the data. I collected it from a foreign website (the Texas Department of Criminal Justice, as the URLs in the code below show). It contains the last statements of death row inmates before execution, together with some personal information about each inmate, and is for reference only.
First, how the data is scraped. This example uses the urllib2, bs4, and sgmllib (SGMLParser) libraries: urllib2 fetches the pages, while bs4 and SGMLParser parse them, and the results are saved to a file. The details are in the code below.
# coding=utf-8

import urllib2
from bs4 import BeautifulSoup
from sgmllib import SGMLParser


class FirstParser(SGMLParser):
    '''SGMLParser that pulls every row of the executed-offenders index
    table (inside <tbody>) out of the page.'''

    def __init__(self):
        SGMLParser.__init__(self)
        self.__start_tbody = False
        self.__start_tr = False
        self.__start_td = False
        self.__start_th = False
        self.__start_a = False
        self.__td_state = 0
        self.__tr_value = []
        self.data = []

    def start_tbody(self, attrs):
        self.__start_tbody = True

    def end_tbody(self):
        self.__start_tbody = False

    def start_tr(self, attrs):
        if self.__start_tbody:
            self.__start_tr = True

    def end_tr(self):
        if self.__start_tbody and self.__start_tr:
            self.data.append(self.__tr_value)
            self.__tr_value = []
            self.__start_tr = False

    def start_th(self, attrs):
        if self.__start_tbody and self.__start_tr:
            self.__start_th = True

    def end_th(self):
        if self.__start_tbody and self.__start_tr and self.__start_th:
            self.__start_th = False

    def start_td(self, attrs):
        if self.__start_tbody and self.__start_tr:
            self.__start_td = True
            self.__td_state += 1

    def end_td(self):
        if self.__start_tbody and self.__start_tr and self.__start_td:
            self.__start_td = False
            self.__td_state = 0

    def start_a(self, attrs):
        if self.__start_tbody and self.__start_tr:
            # keep the href of the link to the per-offender detail page
            self.__tr_value.append(attrs[0][1])
            self.__start_a = True

    def end_a(self):
        if self.__start_tbody and self.__start_tr and self.__start_td:
            self.__start_a = False

    def handle_data(self, data):
        if self.__start_tbody and self.__start_tr and \
                (self.__start_td or self.__start_th):
            if self.__start_th:
                self.__tr_value.append(data)
            if self.__start_td:
                self.__tr_value.append(data)


def read_first(page):
    '''BeautifulSoup-based alternative to FirstParser for reading the
    index table.'''
    soup = BeautifulSoup(page, 'lxml')
    value = []
    for row in soup.find_all('tbody'):
        tbody = row.find_all('tr')
        print len(tbody)
        for index, r in enumerate(tbody):
            t = []
            if index == 0:
                for k in r.find_all('th'):
                    t.append(k.string)
            else:
                for k in r.find_all('td'):
                    t.append(k.string)
            value.append(t)
    return value


def download_second(url):
    '''Fetch an offender-information page: returns the values of the
    personal-details table plus the paragraphs that follow it.'''
    url = 'http://www.tdcj.state.tx.us/death_row/' + url
    page = urllib2.urlopen(url).read()
    page = page.replace('<br />', '')
    soup = BeautifulSoup(page, 'lxml')
    v1 = []
    v2 = []
    for row in soup.find('table').find_all('tr'):
        td = row.find_all('td')
        v1.append(fun_replace(td[len(td) - 1].string))
    p = soup.find_all('p')
    for row in p[1:]:
        temp = []
        if len(row.find_all('span')) > 0:
            try:
                temp.append(fun_replace(str(row.text.split('\r\n')[1].strip())))
            except:
                temp.append('')
        else:
            temp.append('')
        v2.append(temp)
    return [v1, v2]


def download_three(url):
    '''Fetch a last-statement page: date of execution, offender name
    and the last statement itself.'''
    url = 'http://www.tdcj.state.tx.us/death_row/' + url
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')
    p = soup.find_all('p')
    v1 = []
    if len(p) >= 6:
        for index, row in enumerate(p):
            if index % 2 == 1:
                v1.append([fun_replace(p[index].string),
                           fun_replace(p[index + 1].string)])
            if index >= 5:
                break
    return v1


def fun_replace(s):
    # the output file is comma-separated, so commas inside a field are
    # replaced with periods
    return s.replace(',', '.') if s is not None else ''


def down_first():
    url = 'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html'
    # page = urllib2.urlopen(url).read()
    page = open('first.html').read()  # parse a saved copy of the index page
    first = FirstParser()
    first.feed(page)
    value = first.data
    with open('first.txt', 'a+') as f:
        for index, row in enumerate(value):
            print row[0]
            if index == 0:
                # the first parsed row is the table header
                f.write('Execution,Name,TDCJ Number,Date of Birth,Date Received,'
                        'Age (when Received),Education Level (Highest Grade Completed),'
                        'Date of Offense,Age (at the time of Offense),County,Race,Gender,'
                        'Hair Color,Height,Weight,Eye Color,Native County,Native State,'
                        'Prior Occupation,Prior Prison Record,Summary of Incident,Co-Defendants,'
                        'Race and Gender of Victim,Date of Execution,Offender,Last Statement,'
                        'Last Name,First Name,Race,County\n')
            else:
                try:
                    se = download_second(row[1])
                    th = download_three(row[3])
                    fields = [row[0]] + [se[0][i] for i in range(17)] \
                             + [se[1][i][0] for i in range(5)] \
                             + [th[0][1], th[1][0], th[2][0],
                                row[3], row[4], row[-2], row[-1]]
                    f.write((','.join(fields) + '\n').encode('utf-8'))
                except BaseException as e:
                    print e


down_first()

I don't recommend scraping the data directly with this example code: there are some pitfalls in the data, and a lot of it is presented as images, which cannot be extracted. You can simply download the prepared data that comes with the example. Also note that the scraper is Python 2 code: urllib2 and sgmllib no longer exist in Python 3.

Now let's move on to the clustering analysis.

# coding=utf-8

import math
import random
import re


def height(s):
    '''Parse a height string such as 5'7", 5 ft 7 in or 5-7 and return
    the height in centimeters.'''
    if s.find('\'') != -1:
        t = s.replace('"', '').split('\'')
    elif s.find('ft') != -1:
        t = s.replace('.', '').replace('in', '').split('ft')
    elif s.find('-') != -1:
        t = s.split('-')
    else:
        # empty or unrecognized strings fall back to a default of 5'11"
        t = ['5', '11']
    v = [float(t[0].strip()), float(t[1].strip() if len(t[1].strip()) != 0 else '0')]
    return round((12 * v[0] + v[1]) * 30.48 / 12, 2)
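
# Sanity check: height("5' 7\"") parses to [5.0, 7.0] and returns
# (12 * 5 + 7) * 30.48 / 12 = 170.18 cm; empty or unrecognized strings
# fall back to the default of 5'11".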


def grade(s):
    '''Parse the highest grade completed from strings like "10th Grade".'''
    p = re.match(r'\d+', s)
    if p is None:
        return 12
    else:
        sp = p.span()
        return int(s[sp[0]:sp[1]])
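
# Sanity check: grade("10th Grade") -> 10; entries with no leading digits,
# such as "GED", fall back to 12.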


def load_dataset():
    '''Read first.txt and build a numeric feature vector for every record:
    [age when received, education level, height in cm, weight in lbs].
    The remaining columns are kept as that record's label.'''
    dataSet = []
    labels = []
    titles = []
    with open('first.txt', 'r+') as f:
        for index, row in enumerate(f.readlines()):
            if index == 0:
                titles = row.strip().split(',')
            else:
                t = row.strip().split(',')
                dataSet.append([int(t[5]), grade(t[6]), height(t[13]),
                                float(t[14].replace('lbs.', ''))])
                labels.append([v.strip() for i, v in enumerate(t)
                               if i not in (14, 13, 6, 5)])
    return dataSet, labels, titles


def pearson(v1, v2):
    '''
    Pearson correlation distance: returns 1 - r, so 0 means the two
    vectors are perfectly correlated and 2 perfectly anti-correlated.
    :param v1: first vector
    :param v2: second vector
    :return: 1 - Pearson correlation coefficient
    '''
    sum1 = sum(v1)
    sum2 = sum(v2)

    sum1Sq = sum([pow(x, 2) for x in v1])
    sum2Sq = sum([pow(x, 2) for x in v2])

    psum = sum([v1[index] * v2[index] for index in range(len(v1))])

    # compute the Pearson correlation coefficient r
    num = psum - (sum1 * sum2 / len(v1))
    den = math.sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
    if den == 0: return 0
    return 1.0 - num / den


def euclidean(v1, v2):
    '''
    Euclidean distance between two vectors.
    :param v1: first vector
    :param v2: second vector
    :return: the Euclidean distance
    '''
    return math.sqrt(sum([pow(v1[i] - v2[i], 2) for i in range(len(v1))]))
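
# Sanity check: euclidean([0, 0], [3, 4]) returns 5.0.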


class bicluster:
    '''A node in the cluster tree: leaves carry their row index as id,
    merged nodes get negative ids.'''

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance


def hcluster(rows, distance=pearson):
    '''
    Hierarchical (agglomerative) clustering: repeatedly merge the two
    closest clusters until only the root is left.
    :param rows: list of feature vectors
    :param distance: distance function
    :return: the root bicluster of the tree
    '''
    distances = {}
    currentclustid = -1
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = \
                        distance(clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)
        mergevec = [
            (clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
            for i in range(len(clust[0].vec))]
        newcluster = bicluster(mergevec, left=clust[lowestpair[0]],
                               right=clust[lowestpair[1]],
                               distance=closest, id=currentclustid)
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]


def find(clust, labels, data, distance=pearson):
    '''
    Walk down the cluster tree, always following the child closer to the
    query vector, and return the label of the leaf reached.
    :param clust: root bicluster returned by hcluster
    :param labels: labels returned by load_dataset
    :param data: query feature vector
    :param distance: distance function
    :return: the label of the best-matching record
    '''
    while True:
        left = clust.left
        right = clust.right
        if left is None and right is None:
            return labels[clust.id]
        else:
            if left is None and right is not None:
                clust = right  # only the right child exists, descend into it
                continue
            elif left is not None and right is None:
                clust = left  # only the left child exists, descend into it
                continue
            else:
                ls = distance(left.vec, data)
                rs = distance(right.vec, data)
                if ls <= rs:
                    clust = left
                    continue
                else:
                    clust = right
                    continue


def kcluster(rows, distance=pearson, k=4):
    '''
    K-means clustering with randomly initialised centroids.
    :param rows: list of feature vectors
    :param distance: distance function
    :param k: number of clusters
    :return: (bestmatches, clusters)
    '''
    ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows])) \
              for i in range(len(rows[0]))]
    clusters = [[random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0] \
                 for i in range(len(rows[0]))] for j in range(k)]

    lastmatches = None
    for t in range(100):
        # print 'Iteration %d' % t
        bestmatches = [[] for i in range(k)]

        for j in range(len(rows)):
            row = rows[j]
            bestmatch = 0
            for i in range(k):
                d = distance(clusters[i], row)
                if d < distance(clusters[bestmatch], row): bestmatch = i
            bestmatches[bestmatch].append(j)
        if bestmatches == lastmatches: break
        lastmatches = bestmatches

        for i in range(k):
            avgs = [0.0] * len(rows[0])
            if len(bestmatches[i]) > 0:
                for rowid in bestmatches[i]:
                    for m in range(len(rows[rowid])):
                        avgs[m] += rows[rowid][m]
                for j in range(len(avgs)):
                    avgs[j] /= len(bestmatches[i])
                clusters[i] = avgs

    return bestmatches, clusters
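
# Note: the loop above runs at most 100 iterations and stops early once the
# assignments (bestmatches) stop changing; a cluster that receives no rows
# simply keeps its previous random centroid.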


def find_k(bestmatches, clusters, dataSet, labels, data, distance=pearson):
    '''Find the record closest to the query vector: first pick the nearest
    centroid, then the nearest row within that cluster.'''
    best = -1
    best_value = float('inf')  # start at infinity; distances are never negative
    for i in range(len(clusters)):
        t1 = distance(clusters[i], data)
        if t1 <= best_value:
            best = i
            best_value = t1
    best1 = -1
    best_value1 = float('inf')
    for i, row in enumerate(bestmatches[best]):
        t1 = distance(dataSet[row], data)
        if t1 <= best_value1:
            best1 = row
            best_value1 = t1
    return labels[best1], dataSet[best1]

If anything is unclear, check the comments. There are two distance measures, the Pearson correlation coefficient and the Euclidean distance, and two clustering methods, hierarchical clustering and k-means. The distance function is passed in as a parameter, which makes the code easy to extend later.
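To make the difference between the two measures concrete, here is a minimal sanity check (the vectors are arbitrary illustration values, not real records from the data set): the Pearson distance ignores scale, so a vector and a scaled copy of it count as identical, while the Euclidean distance does not.

# coding=utf-8

from analysis import pearson, euclidean

a = [26, 16, 176, 160]
b = [52, 32, 352, 320]  # the same vector scaled by 2

print pearson(a, b)    # 0.0: perfectly correlated, so the Pearson distance is zero
print euclidean(a, b)  # about 239.81: the Euclidean distance reacts to the scale change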
Below is the test program.

# coding=utf-8

from analysis import *
from show import *

dataSet, labels, titles = load_dataset()
# Test hierarchical clustering with the Pearson correlation coefficient
clust = hcluster(dataSet, distance=pearson)
result = find(clust, labels, [26, 16, 176, 160], distance=pearson)
print result
# Test hierarchical clustering with the Euclidean distance
# clust = hcluster(dataSet, distance=euclidean)
# result = find(clust, labels, [26, 16, 176, 160], distance=euclidean)
# print result

# Print the clustering result as a hierarchical tree
# printclust(clust)

# Test k-means clustering with the Pearson correlation coefficient
# bestmatches, clusters = kcluster(dataSet, pearson, 4)
# result = find_k(bestmatches, clusters, dataSet, labels, [26, 16, 176, 160], pearson)
# print result

I'm still a beginner, and this is shared for mutual learning; pointers from more experienced readers are very welcome. Next, I'll write about classification with KNN.

Download link