pyspark讀取檔案路徑 和 檔案
阿新 • • 發佈:2018-11-04
程式碼:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author : 何小義
"""Read file contents (and file paths) with PySpark.

Demonstrates the difference between SparkContext.textFile (content only)
and SparkContext.wholeTextFiles ((path, content) pairs).
"""
import sys
import os
import json

# Python 2-only workaround: reload() re-exposes sys.setdefaultencoding,
# which CPython deletes at interpreter startup.
reload(sys)
sys.setdefaultencoding('utf8')

# Local Spark installation (NOTE: adjust these paths for your machine).
os.environ['SPARK_HOME'] = "/usr/spark-2.0.1"
sys.path.append("/usr/spark-2.0.1/python")
sys.path.append("/usr/spark-2.0.1/python/bin")

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

# === Read files with Spark ============================================
# Cluster mode:
# spark = SparkSession.builder.master(SERVER_URL).appName("hzy_test_script").getOrCreate()
# Local mode:
spark = SparkSession.builder.master('local').appName("hzy_test_script").getOrCreate()
sc = spark.sparkContext

# rdd_data = sc.textFile("xxx.t*xt")       # file content only
# FIX: the API is wholeTextFiles (plural, capital T); the original
# called the non-existent sc.wholetextFile and would raise AttributeError.
rdd_data = sc.wholeTextFiles("xxx.t*xt")   # (file path, file content) pairs

result = rdd_data.take(10)
print(result)
print('完成...')
注:
1. data_rdd = sc.textFile('xxxxxxx.txt') # 讀入檔案內容,返回的東西是rdd
2. path_data_rdd = sc.wholeTextFiles('xxxxxxx.txt') # 不僅讀入檔案內容,還會讀入檔案的路徑path