程式人生 > 大資料系列之實時計算Spark(十八)Python生成圖表

大資料系列之實時計算Spark(十八)Python生成圖表

1.啟動pyspark
pyspark --master local[2]                                                                                                 

2.編寫程式碼(以下程式碼在 pyspark shell 中執行,sc 與 spark 由 shell 預先建立)
# Import the Spark SQL Row type plus the plotting and numeric libraries
from pyspark.sql import Row
import matplotlib.pyplot as plt
import numpy as np
import pylab as P
# Start from matplotlib's default rc settings.
plt.rcdefaults()

# users.dat is read as plain text; the fields of each record are separated by "::".
dataDir = "file:///home/zpx/ml-data/ml-1m/users.dat"
lines = sc.textFile(dataDir)
splitLines = lines.map(lambda rec: rec.split("::"))

def _to_user_row(fields):
    """Build a Row from the first five positional fields, casting age to int."""
    return Row(
        id=fields[0],
        gender=fields[1],
        age=int(fields[2]),
        occupation=fields[3],
        zipcode=fields[4],
    )

usersRDD = splitLines.map(_to_user_row)

# Register the DataFrame as the temp view "users" so it can be queried via spark.sql().
usersDF = spark.createDataFrame(usersRDD)
usersDF.createOrReplaceTempView("users")
usersDF.show()


# Histogram of user ages.
ageDF = spark.sql("SELECT age FROM users")
# Pull the age column back to the driver as a plain Python list for matplotlib.
ageList = ageDF.rdd.map(lambda p: p.age).collect()
# Print the summary statistics of the age column.
ageDF.describe().show()

# Open a dedicated figure: plt.show(block=False) does not close the current
# figure, so without this the chart is drawn on top of whatever is already there.
plt.figure()
plt.hist(ageList)
plt.title("Age distribution of the users\n")
plt.xlabel("Age")
plt.ylabel("Number of users")
plt.show(block=False)

# Density plot of user ages (Gaussian kernel density estimate).
from scipy.stats import gaussian_kde
# A scalar bw_method is used directly as the KDE bandwidth factor; this is the
# public equivalent of overriding covariance_factor and calling the private
# _compute_covariance().
density = gaussian_kde(ageList, bw_method=0.5)
# Evaluate the estimate on 1000 evenly spaced points between 0 and 100.
xAxisValues = np.linspace(0,100,1000)
# Open a dedicated figure so the curve is not drawn over a previous chart
# (plt.show(block=False) leaves the earlier figure current).
plt.figure()
plt.title("Age density plot of the users\n")
plt.xlabel("Age")
plt.ylabel("Density")
plt.plot(xAxisValues, density(xAxisValues))
plt.show(block=False)

# Side-by-side subplots: age histogram (left) and horizontal box plot (right).
# Open a dedicated figure so the subplots are not laid over a previous chart
# (plt.show(block=False) leaves the earlier figure current).
plt.figure()
# 1 row x 2 columns grid, first cell.
plt.subplot(121)
plt.hist(ageList)
plt.title("Age distribution of the users\n")
plt.xlabel("Age")
plt.ylabel("Number of users")
# 1 row x 2 columns grid, second cell.
plt.subplot(122)
plt.title("Summary of distribution\n")
plt.xlabel("Age")
plt.boxplot(ageList, vert=False)
plt.show(block=False)

# Horizontal bar chart of the 10 occupations with the most users.
occ10 = spark.sql("SELECT occupation, count(occupation) as usercount FROM users GROUP BY occupation ORDER BY usercount DESC LIMIT 10")
occ10.show()

# Collect (occupation, usercount) pairs and unzip them into two parallel tuples.
occTuple = occ10.rdd.map(lambda p:(p.occupation,p.usercount)).collect()
occList, countList = zip(*occTuple)
# Bare expression: echoes the occupation tuple when run in the interactive shell.
occList

# Open a dedicated figure so the bars are not drawn over a previous chart
# (plt.show(block=False) leaves the earlier figure current).
plt.figure()
y_pos = np.arange(len(occList))
plt.barh(y_pos, countList, align='center', alpha=0.4)
plt.yticks(y_pos, occList)
plt.xlabel('Number of users')
plt.title('Top 10 user types\n')
# Widen the left margin to leave room for the y tick labels.
plt.gcf().subplots_adjust(left=0.15)
plt.show(block=False)


# Stacked bar chart: male and female user counts per occupation.
occGender = spark.sql("SELECT occupation, gender FROM users")
occGender.show()

# Cross-tabulate occupation against gender; the rows are read below through the
# columns occupation_gender (the occupation label), M and F.
occCrossTab = occGender.stat.crosstab("occupation","gender")
occupationsCrossTuple = occCrossTab.rdd.map(lambda p:(p.occupation_gender,p.M, p.F)).collect()
occList, mList, fList = zip(*occupationsCrossTuple)
N = len(occList)
ind = np.arange(N)
width = 0.75
# Open a dedicated figure so the bars are not drawn over a previous chart
# (plt.show(block=False) leaves the earlier figure current).
plt.figure()
# Centre the bars on their x positions explicitly so the tick labels placed at
# `ind` line up with them regardless of the matplotlib default alignment.
p1 = plt.bar(ind, mList, width, color='r', align='center')
# The female counts are stacked on top of the male counts.
p2 = plt.bar(ind, fList, width, color='y', bottom=mList, align='center')
plt.ylabel('Count')
plt.title('Gender distribution by occupation\n')
plt.xticks(ind, occList, rotation=90)
plt.legend((p1[0], p2[0]), ('Male', 'Female'))
# Enlarge the bottom margin to leave room for the rotated tick labels.
plt.gcf().subplots_adjust(bottom=0.25)
plt.show(block=False)

# Pie chart of the 10 occupations with the fewest users.
occupationsBottom10 = spark.sql("SELECT occupation,count(occupation) as usercount FROM users GROUP BY occupation ORDER BY usercount LIMIT 10")
occupationsBottom10Tuple = occupationsBottom10.rdd.map(lambda p:(p.occupation,p.usercount)).collect()
occupationsBottom10List, countBottom10List =zip(*occupationsBottom10Tuple)
# Per-wedge offsets from the centre. plt.pie requires one entry per wedge, so
# trim the tuple in case the query returns fewer than 10 occupations.
explode = (0, 0.3, 0.2, 0.15,0.1,0,0,0,0,0.1)[:len(countBottom10List)]
# Open a dedicated figure so the pie is not drawn over a previous chart
# (plt.show(block=False) leaves the earlier figure current).
plt.figure()
plt.pie(countBottom10List, explode=explode,labels=occupationsBottom10List, autopct='%1.1f%%', shadow=True,startangle=90)
plt.title('Bottom 10 user types\n')
plt.show(block=False)

所用資料集下載:

連結:https://pan.baidu.com/s/1vUTt2GvPtlsNfqcWvo3lIA 密碼:izpa