1. 程式人生 > >第79課:Spark SQL基於網站Log的綜合案例綜合程式碼和實際執行測試

第79課:Spark SQL基於網站Log的綜合案例綜合程式碼和實際執行測試

內容:

    1.熱門論壇板塊排名
    2.綜合程式碼實戰和測試

一、熱門論壇板塊排版

建立表:    

spark.sql("createtable userlogs(date string, timestamp bigint, userID bigint, pageID bigint, channel string, action string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'")

匯入資料

    load data local inpath '/home/hadoop/learnSpark/SparkSQLDataManually/userLogs.log' into table row format delimited fields terminated by '\t' lines terminated by '\n';

二、綜合程式碼實戰和測試

package SparkSQL;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.hive.HiveContext;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

/**
 * FileName: SparkSQLUserLogsOps
 * Author:   hadoop
 * Email:    
[email protected]
* Date: 18-11-12 下午10:19 * Description: * Table in hive database creation * * sqlContext.sql("create table userlogs(date string, timestamp bigint, userID bigint, pageID bigint, channel string, action string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'") */ public class SparkSQLUserLogsOps { public static void main(String[] args){ //建立SparkConf用於讀取系統資訊並設定運用程式的名稱 SparkConf conf = new SparkConf().setAppName("SparkSQLUserLogsOps").setMaster("spark://Master:7077"); //建立JavaSparkContext物件例項作為整個Driver的核心基石 JavaSparkContext sc = new JavaSparkContext(conf); //設定輸出log的等級 sc.setLogLevel("INFO"); //建立SQLContext上下文物件,用於SqL的分析 HiveContext hiveContext = new HiveContext(sc.sc()); String yesterday = getTwodaysago(); pvStatistic(hiveContext,yesterday); //pv hotChannel(hiveContext,yesterday);//熱門板塊 uvStatistic(hiveContext,yesterday); //uv jumpOutStatistic(hiveContext,yesterday);//跳出頁面 newUserRegisterPercentStatistic(hiveContext, yesterday); //新使用者註冊的比例 } private static void newUserRegisterPercentStatistic(HiveContext hiveContext, String yesterday) { hiveContext.sql("use hive"); String newUserSQL = "select count(*) " + "from userlogs " + "where action = 'View' and date='"+ yesterday+"' and userID is NULL " ; String RegisterUserSQL = "SELECT count(*) " + "from userlogs" + "where action = 'Register' and date='"+ yesterday+"' " ; Object newUser = hiveContext.sql(newUserSQL).collect()[0].get(0); Object RegisterUser = hiveContext.sql(RegisterUserSQL).collect()[0].get(0); double total = Double.valueOf(newUser.toString()); double register = Double.valueOf(RegisterUser.toString()); System.out.println("模擬新使用者註冊比例:" + register / total); } private static void jumpOutStatistic(HiveContext hiveContext, String yesterday) { hiveContext.sql("use hive"); String totalPvSQL = "select count(*) " + "from " + "userlogs " + "where action = 'View' and date='"+ yesterday+"' " ; String pv2OneSQL = "SELECT count(*) " + "from " + "(SELECT count(*) totalNumber from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by userID " + "having totalNumber = 1) subquery " ; Object totalPv = hiveContext.sql(totalPvSQL).collect()[0].get(0); Object pv2One = hiveContext.sql(pv2OneSQL).collect()[0].get(0); double total = Double.valueOf(totalPv.toString()); double pv21 = Double.valueOf(pv2One.toString()); System.out.println("跳出率為" + pv21 / total); } private static void uvStatistic(HiveContext hiveContext, String yesterday){ hiveContext.sql("use hive"); String sqlText = "select date, pageID, uv " + "from " + "(select date, pageID, count(distinct(userID)) uv from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by date, pageID) subquery " + "order by uv desc " ; hiveContext.sql(sqlText).show(); } private static void hotChannel(HiveContext hiveContext, String yesterday) { hiveContext.sql("use hive"); String sqlText = "select date, pageID, pv " + "from " + "(select date, pageID, count(1) pv from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by date, pageID) subquery " + "order by pv desc " ; hiveContext.sql(sqlText).show(); } private static void pvStatistic(HiveContext hiveContext, String twodaysago) { hiveContext.sql("use hive"); String sqlText = "select date,pageID,pv " +" from (select date,pageID,count(*) pv from userlogs " + "where action = 'View' and date = '"+twodaysago+"' group by date,pageID ) subqurey order by pv desc limit 10"; hiveContext.sql(sqlText).show(); } private static String getTwodaysago() { SimpleDateFormat date = new SimpleDateFormat("yyyy-MM-dd"); Calendar calender = Calendar.getInstance(); calender.setTime(new Date()); calender.add(Calendar.DATE,-2); Date yesterday = calender.getTime(); return date.format(yesterday); } }