讀取大檔案資料進入redis作為快取:贈(廣播變數)

阿新 • • 發佈：2018-12-11

在專案中使用Redis做快取檔案(目的等同於廣播變數):

package com.app
import com.utils.{JedisConnectionPool, RptUtils}
import org.apache.commons.lang.StringUtils
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Spark job that stores an app dictionary in Redis and then builds a per-app report
  * from a parquet log, looking names up in Redis instead of using a broadcast variable.
  *
  * Expects exactly three arguments: the parquet input path, an output path (accepted
  * but never used by this job), and the path of the tab-separated dictionary file.
  */
object AppRpt2 {

  /**
    * Wraps `underlying` so that `release` runs exactly once: when the iterator is
    * exhausted, or when pulling from it throws.
    *
    * `mapPartitions` returns a lazy iterator, so a resource used inside the mapping
    * function must stay open until the last element has been produced; closing it
    * right after building the iterator would break every later lookup.
    */
  private def closeWhenDrained[A](underlying: Iterator[A])(release: () => Unit): Iterator[A] =
    new Iterator[A] {
      private var released = false

      private def releaseOnce(): Unit =
        if (!released) {
          released = true
          release()
        }

      override def hasNext: Boolean = {
        val more =
          try underlying.hasNext
          catch {
            case scala.util.control.NonFatal(e) =>
              releaseOnce()
              throw e
          }
        if (!more) releaseOnce()
        more
      }

      override def next(): A =
        try underlying.next()
        catch {
          case scala.util.control.NonFatal(e) =>
            releaseOnce()
            throw e
        }
    }

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("目錄不存在,請重新輸入")
      sys.exit()
    }
    // The second argument (output path) is part of the CLI but is not used by this job.
    val Array(inputPath, _, resultPath) = args
    val conf = new SparkConf().setAppName(s"${this.getClass.getName}").setMaster("local[*]")
      // Use Kryo as Spark's serializer.
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val sQLContext = new SQLContext(sc)
    // Parquet compression codec.
    sQLContext.setConf("spark.sql.parquet.compression.codec", "snappy")

    // Dictionary file: tab-separated, rows with fewer than 5 columns are dropped.
    // Column 4 becomes the key and column 1 the value (author's example: com.123.cn -> 愛奇藝).
    val dicMap = sc.textFile(resultPath)
      .map(_.split("\t", -1))
      .filter(_.length >= 5)
      .map(arr => (arr(4), arr(1)))

    // Store the dictionary in Redis, one connection per partition.
    // NOTE(review): assumes getConnection() returns a pooled redis.clients.jedis.Jedis
    // whose close() hands the connection back to the pool — confirm against JedisConnectionPool.
    dicMap.foreachPartition { part =>
      val jedis = JedisConnectionPool.getConnection()
      try part.foreach { case (appId, appName) => jedis.set(appId, appName) }
      finally jedis.close()
    }

    // Fallback key when neither the row nor Redis provides a name; same literal the
    // broadcast-based variant of this job uses.
    val unknownApp = "unknow"

    // Read the parquet log. Going through `.rdd` keeps the RDD-only `reduceByKey`
    // available regardless of whether DataFrame.mapPartitions itself returns an RDD.
    val df = sQLContext.read.parquet(inputPath)
    df.rdd.mapPartitions { rows =>
      val jedis = JedisConnectionPool.getConnection()
      val mapped = rows.map { row =>
        val rawName = row.getAs[String]("appname")
        // When the row has no usable app name, look it up in Redis by appid.
        // A missing/blank appid or a Redis miss (null reply) falls back to unknownApp
        // instead of producing a null reduce key.
        val appname =
          if (StringUtils.isBlank(rawName)) {
            Option(row.getAs[String]("appid"))
              .filter(id => StringUtils.isNotBlank(id))
              .flatMap(id => Option(jedis.get(id)))
              .getOrElse(unknownApp)
          } else rawName

        // Fields for: raw requests, effective requests, ad requests.
        val requestmode = row.getAs[Int]("requestmode")
        val processnode = row.getAs[Int]("processnode")
        val iseffective = row.getAs[Int]("iseffective")
        // Fields for: bids entered, bids won, impressions, clicks.
        val isbilling = row.getAs[Int]("isbilling")
        val isbid = row.getAs[Int]("isbid")
        val iswin = row.getAs[Int]("iswin")
        val adorderid = row.getAs[Int]("adorderid")
        // Fields for: ad spend, ad cost.
        val winPrice = row.getAs[Double]("winprice")
        val adpayment = row.getAs[Double]("adpayment")

        // Business metrics computed by the project helper.
        val reqlist = RptUtils.calculateReq(requestmode, processnode)
        val rtblist = RptUtils.calculateRtb(iseffective, isbilling, isbid, iswin, adorderid, winPrice, adpayment)
        val cliklist = RptUtils.calculateTimes(requestmode, iseffective)
        (appname, reqlist ++ rtblist ++ cliklist)
      }
      // Release the connection only after the lazy iterator has been fully consumed.
      closeWhenDrained(mapped)(() => jedis.close())
    }.reduceByKey { (list1, list2) =>
      // Element-wise sum, e.g. List(0,2,1,5) and List(2,5,4,7) -> List(2,7,5,12).
      list1.zip(list2).map(t => t._1 + t._2)
    }.map { t =>
      t._1 + "," + t._2.mkString(",")
    }.take(10).foreach(println)
  }
}

贈:利用廣播變數廣播小檔案:

package com.app

import com.utils.RptUtils
import org.apache.commons.lang.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

/**
  * Spark job that broadcasts a small app dictionary to the executors and builds a
  * per-app report from a parquet log.
  *
  * Expects exactly three arguments: the parquet input path, an output path (accepted
  * but never used by this job), and the path of the tab-separated dictionary file.
  */
object AppRpt {
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("目錄不存在,請重新輸入")
      sys.exit()
    }
    // The second argument (output path) is part of the CLI but is not used by this job.
    val Array(inputPath, _, resultPath) = args
    val conf = new SparkConf().setAppName(s"${this.getClass.getName}").setMaster("local[*]")
      // Use Kryo as Spark's serializer (the original author's "second requirement").
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val sQLContext = new SQLContext(sc)
    // Set the parquet codec explicitly; per the original author's note, snappy was not
    // yet the default in Spark 1.6 and became the default in 2.0.
    sQLContext.setConf("spark.sql.parquet.compression.codec", "snappy")

    // Dictionary file: tab-separated, rows with fewer than 5 columns are dropped.
    // Column 4 becomes the key and column 1 the value (author's example: com.123.cn -> 愛奇藝).
    // The whole dictionary is collected to the driver, so it must be small.
    val dicMap: Map[String, String] = sc.textFile(resultPath)
      .map(_.split("\t", -1))
      .filter(_.length >= 5)
      .map(arr => (arr(4), arr(1)))
      .collect()
      .toMap

    // Ship the dictionary to the executors as a broadcast variable.
    val broadcast = sc.broadcast(dicMap)

    // Going through `.rdd` keeps the RDD-only `reduceByKey` available regardless of
    // whether DataFrame.map itself returns an RDD.
    val df = sQLContext.read.parquet(inputPath)
    df.rdd.map { row =>
      val rawName = row.getAs[String]("appname")
      // When the row has no usable app name, look it up in the dictionary by appid;
      // ids missing from the dictionary map to "unknow".
      val appname =
        if (StringUtils.isBlank(rawName)) broadcast.value.getOrElse(row.getAs[String]("appid"), "unknow")
        else rawName

      // Fields for: raw requests, effective requests, ad requests.
      val requestmode = row.getAs[Int]("requestmode")
      val processnode = row.getAs[Int]("processnode")
      val iseffective = row.getAs[Int]("iseffective")
      // Fields for: bids entered, bids won, impressions, clicks.
      val isbilling = row.getAs[Int]("isbilling")
      val isbid = row.getAs[Int]("isbid")
      val iswin = row.getAs[Int]("iswin")
      val adorderid = row.getAs[Int]("adorderid")
      // Fields for: ad spend, ad cost.
      val winPrice = row.getAs[Double]("winprice")
      val adpayment = row.getAs[Double]("adpayment")

      // Business metrics computed by the project helper.
      val reqlist = RptUtils.calculateReq(requestmode, processnode)
      val rtblist = RptUtils.calculateRtb(iseffective, isbilling, isbid, iswin, adorderid, winPrice, adpayment)
      val cliklist = RptUtils.calculateTimes(requestmode, iseffective)
      (appname, reqlist ++ rtblist ++ cliklist)
    }.reduceByKey { (list1, list2) =>
      // Element-wise sum, e.g. List(0,2,1,5) and List(2,5,4,7) -> List(2,7,5,12).
      list1.zip(list2).map(t => t._1 + t._2)
    }.map { t =>
      t._1 + "," + t._2.mkString(",")
    }.take(1000).foreach(println)
  }
}

讀取大檔案資料進入redis作為快取:贈(廣播變數)

在專案中使用Redis做快取檔案(目的等同於廣播變數): package com.app import com.utils.{JedisConnectionPool, RptUtils} import org.apache.commons.lang.StringUtils import

python讀取大檔案的方法 python計算檔案的行數和讀取某一行內容的實現方法

python計算檔案的行數和讀取某一行內容的實現方法：最簡單的辦法是把檔案讀入一個大的列表中,然後統計列表的長度.如果檔案的路徑是以引數的形式filepath傳遞的,那麼只用一行程式碼就可以完成我們的需求了: 1、http://blog.csdn.net/shudaq

android讀取csv檔案資料

csv檔案是一種表格形式的檔案，如果把檔案字尾名改為.txt，會發現同一行資料之間是用英文“,”隔開的。如何讀取csv檔案以便把資料存入資料庫呢，特別是csv檔案中有些資料是空？ csv檔案如下：把檔案字尾名改為.txt後如下：電錶id,電錶編號,模組地址,描述,所屬站點名稱,

fread讀取大檔案以及返回值問題(轉載)

今天fread檔案讀取遇到問題，本來很小的一個問題，但是一直沒有注意到，導致花了不少時間除錯，所以寫下來備忘一下。 size_t fread ( void * ptr, size_t size, size_t count, FILE * stream ); /

TensorFlow讀取二進位制檔案資料到佇列

TensorFlow是一種符號程式設計框架（與theano類似），先構建資料流圖再輸入資料進行模型訓練。Tensorflow支援很多種樣例輸入的方式。最容易的是使用placeholder，但這需要手動傳遞numpy.array型別的資料。第二

pandas.read_csv——分塊讀取大檔案

訪問本站觀看效果更佳 read_csv中有個引數chunksize，通過指定一個chunksize分塊大小來讀取檔案，返回的是一個可迭代的物件TextFileReader，IO Tools 舉例如下： In [138]: reader = pd.read_table('

pandas讀取大檔案時memoryerror的解決辦法

再用pd.read_csv讀取大檔案時，如果檔案太大，會出現memoryerror的問題。解決辦法一：pd.read_csv的引數中有一個chunksize引數，為其賦值後，返回一個可迭代物件TextFileReader，對其遍歷即可 reader = pd.read_csv(file_

spring+ mybatis 二級快取使用 redis作為快取

springMybatisConfig.xml配置 <?xml version="1.0" encoding="utf-8"?> <beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="ht

python讀取大檔案和普通檔案

讀取檔案，最常見的方式是： with open('filename', 'r', encoding = 'utf-8') as f: for line in f.readlines(): do_something(line) 但是，當完成這一操作時，readlines()

linux系統從百度網盤中拉大檔案資料

有些深度學習相關的資料集合很有用，而且資料量特別大，為了以後學習使用，通常儲存到百度網盤中。有時候資料來源於比賽網站，不能直接使用wget url獲得資料，可以先把資料儲存到百度網盤，通過離線下載輸入相應的url，進行資料儲存。在linux命令列中，快速地拉取資料變得很有用了，結合自己的經歷，特

C++讀取txt檔案資料

本次實驗主要目的是實現C++提取txt檔案的資料，txt檔案中的資料為double型。 txt檔案的資料為 1.123456789098 2.123456789098 3.123456789098 4.123456789098 5.123456789098 6.123456789098 7

大檔案資料分片上傳簡單示例

文章目錄大檔案資料分片上傳簡單示例 1. HTML 2. JS 3. Java 3.1 檔案屬性實體類 3.2 統一返回結果定義 3.3 Controller

REDIS學習（3.2）spring boot 使用redis作為快取

一，指定主鍵的生成規則在3.1的基礎上修改RedisConfig @Configuration @EnableCaching public class RedisConfig extends CachingConfigurerSupport { @Be

java讀取xml檔案資料

import java.io.File; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import o

伺服器之間如何跨國傳輸大檔案資料？

跨國大檔案傳輸，是各大企業普遍面臨的問題，其中主要突出的矛盾有：怎麼保證跨國傳輸的資料可靠性？怎麼提高跨國傳輸的傳輸效率？這兩個矛盾，要從底層傳輸協議去分析解決：保證傳輸資料的可靠性，首先你使用的傳輸協議需要是可靠的，比如使用經典的TCP協議，面向連結的可靠的位元組流服務，提供超時

Redis作為快取實現工具類

使用Redis作為快取物件,常用的儲存格式為字串,所以在儲存快取時,將物件轉為字串儲存.由於存的時候為字串,所以取出的也為json字串. 此工具類在設值時只需要將key與物件傳入即可取值時只需要將key與要取的物件型別傳入即可 public class CacheUtilImpl im

Python 讀取大檔案的方式

對於讀取容量小的檔案，可以使用下面的方法： with open("path", "r") as f: f.read() 但是如果檔案容量很大，高達幾個G或者十幾個G，使用上面這種方式就容易造成記憶體溢位的問題，所以如果進行大容量的檔案讀取建議使用下面這種方式： with open

使用python來讀取超大型檔案資料

在實際應用中，幾乎所有的資料分析工作都是從資料讀取開始的，如果資料量太大導致資料檔案讀取失敗了，這樣後續的工作就沒有辦法進行了，在機器自身硬體記憶體限制的情況下，當檔案量過大的時候直接使用read等函式來進行操作的時候就會報錯，這裡就需要採取一定的策略來儘可能地避免這樣的

SpringBoot 2.x 使用Redis作為快取設定有效時間

redis 配置 redis: database: 0 host: localhost port: 6379 password: jedis: pool: max-active: 8 max-wait:

java 分次讀取大檔案的三種方法

1. java 讀取大檔案的困難 java 讀取檔案的一般操作是將檔案資料全部讀取到記憶體中，然後再對資料進行操作。例如 Path path = Paths.get("file path"); byte[] data = Files.readAllBytes(path)

讀取大檔案資料進入redis作為快取:贈(廣播變數)

相關推薦