
PySpark: reading file paths and file contents

Code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author  : 何小義

import sys
# Python 2 only: reload() and sys.setdefaultencoding() do not exist in Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
import os
import json

# Local Spark (note: change these to your own Spark installation path)
os.environ['SPARK_HOME'] = "/usr/spark-2.0.1"
sys.path.append("/usr/spark-2.0.1/python")
sys.path.append("/usr/spark-2.0.1/python/bin")

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

# === Reading files with Spark ========================================

# Configure Spark (cluster)
# spark = SparkSession.builder.master(server_url).appName("hzy_test_script").getOrCreate()
# Configure Spark (local)
spark = SparkSession.builder.master('local').appName("hzy_test_script").getOrCreate()
sc = spark.sparkContext

# rdd_data = sc.textFile("xxx.t*xt")        # file contents only
rdd_data = sc.wholeTextFiles("xxx.t*xt")    # file path + contents
result = rdd_data.take(10)
print(result)

print('Done...')
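
If you prefer the DataFrame API, a similar pairing of path and content can be had with input_file_name(), which tags each row with the file it came from. A minimal sketch, assuming the spark session configured above and a hypothetical file name:

from pyspark.sql.functions import input_file_name

# Each row of spark.read.text() is one line of text; input_file_name()
# adds the source file's path as an extra column. (hypothetical path)
df = spark.read.text("xxx.txt").withColumn("path", input_file_name())
df.show(10, truncate=False)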

Notes:

1. data_rdd = sc.textFile('xxxxxxx.txt')  # reads the file contents; returns an RDD of lines

2. path_data_rdd = sc.wholeTextFiles('xxxxxxx.txt')  # reads not only the file contents but also each file's path
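
To make the difference concrete, here is a minimal sketch of the two return shapes (file names are hypothetical): textFile yields an RDD of lines, while wholeTextFiles yields an RDD of (path, content) pairs, so the paths can be pulled out directly:

lines_rdd = sc.textFile("data/*.txt")        # RDD[str]: one element per line
pairs_rdd = sc.wholeTextFiles("data/*.txt")  # RDD[(str, str)]: one (path, content) pair per file

paths = pairs_rdd.keys().collect()           # just the file paths
for path, content in pairs_rdd.take(2):
    print(path, len(content))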