1. 程式人生 > >Spark實戰(5) DataFrame基礎之處理缺失值

Spark實戰(5) DataFrame基礎之處理缺失值

Drop Missing Values

from pyspark.sql import SparkSession

# Start (or reuse) a Spark session registered under the app name 'aggs'.
spark = (
    SparkSession.builder
    .appName('aggs')
    .getOrCreate()
)

# Load the sales CSV: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv('sales_info.csv', header=True, inferSchema=True)

# Print the resulting schema, then preview the rows.
df.printSchema()
df.show()

# Drop missing data

# Drop every row that contains at least one null (the default, how='any').
df.na.drop().show()

# Keep only the rows that have at least 2 non-null values.
df.na.drop(thresh=2).show()

# Use the `how` parameter.
df.na.drop(how='all').show()  # drop a row only when all of its values are null
df.na.drop(how='any').show()  # drop a row when any of its values is null

# Only consider a subset of the columns when looking for nulls.
df.na.drop(subset=['Sales']).show()

Fill Missing Data

# A string replacement value only fills nulls in string-typed columns.
df.na.fill('FILL VALUE').show()

# A numeric replacement value only fills nulls in numeric-typed columns.
df.na.fill(0).show()

# Restrict the fill to a specific subset of columns.
df.na.fill('No Name', subset=['Name']).show()

# Fill the nulls in 'Sales' with the column mean.
from pyspark.sql.functions import mean

mean_val = df.select(mean(df['Sales'])).collect()
# collect() returns a list of Row objects; the aggregate is the first field
# of the first (and only) row.
mean_sales = mean_val[0][0]
df.na.fill(mean_sales, ['Sales']).show()