"""
SparkToPandas Documentation
-----------------------------
SparkToPandas is a simple plugin that works alongside Spark. It was designed to let you work with PySpark using a syntax closer to that of pandas.
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
class Spark_pandas:
    """
    Helper functions for pyspark that follow a pandas-like syntax.

    The instance only stores the SparkSession it is given; the dataframe
    helpers receive the dataframe to operate on as an argument.
    """

    def __init__(self, spark):
        """
        Store the SparkSession used by :meth:`read_csv`.

        :param spark: SparkSession instance
        """
        self.spark = spark

    def read_csv(self, file_location, header=True):
        """
        Read a csv file through ``self.spark.read.csv``.

        :param file_location: str, path handed to the Spark csv reader
        :param header: bool, forwarded as the reader's ``header`` option
        :return: the dataframe produced by the Spark csv reader
        """
        return self.spark.read.csv(file_location, header=header)

    def conditional_func(self, x):
        """
        Sample function that adds 1 to its argument.

        :param x: int
        :return: ``x + 1``
        """
        return x + 1

    def column_creator(self, df, primary_column, new_column_name, user_func):
        """
        Add a column computed by a user-defined function.

        ``user_func`` is called once with ``df[primary_column]`` (the column
        object, not the individual values) and its result is passed to
        ``df.withColumn``.

        :param df: dataframe
        :param primary_column: str, name of the column given to ``user_func``
        :param new_column_name: str, name of the column to create
        :param user_func: function taking the column and returning the new column
        :return: dataframe returned by ``df.withColumn``
        """
        derived = user_func(df[primary_column])
        return df.withColumn(new_column_name, derived)

    def head(self, df, n):
        """
        Print the first or last rows of the dataframe.

        A non-negative ``n`` prints ``df.head(n)``; a negative ``n`` prints
        ``df.tail(-n)``. The rows are wrapped in a pandas DataFrame purely
        for display.

        :param df: dataframe
        :param n: int, number of rows; negative values select from the end
        :return: None
        """
        rows = df.tail(-n) if n < 0 else df.head(n)
        print(pd.DataFrame(rows, columns=df.columns))

    def subset_columns(self, column_names, df):
        """
        Return a dataframe restricted to the given columns.

        :param column_names: list of column names, unpacked into ``df.select``
        :param df: dataframe
        :return: dataframe
        """
        return df.select(*column_names)

    def sort_df(self, df, col_name, ascending=True):
        """
        Sort the dataframe by the given column(s).

        :param df: dataframe
        :param col_name: str or list, forwarded to ``df.sort``
        :param ascending: bool, forwarded to ``df.sort``
        :return: dataframe
        """
        return df.sort(col_name, ascending=ascending)

    def drop_na(self, df, col_name=None):
        """
        Drop rows containing null values.

        Without ``col_name`` every column is considered; otherwise
        ``col_name`` is forwarded as the ``subset`` argument.

        :param df: dataframe
        :param col_name: str or list of column names, optional
        :return: dataframe
        """
        # Identity check: ``== None`` can misbehave on array-like arguments
        # that overload ``==``.
        if col_name is None:
            return df.na.drop()
        return df.na.drop(subset=col_name)

    def fillna(self, df, value, col_name=None):
        """
        Replace null values with ``value``.

        Without ``col_name`` the fill is applied through ``df.na.fill(value)``;
        otherwise ``col_name`` is forwarded as the ``subset`` argument.

        :param df: dataframe
        :param value: int/str/float, replacement value
        :param col_name: str or list of column names, optional
        :return: dataframe
        """
        if col_name is None:
            return df.na.fill(value)
        return df.na.fill(value, subset=col_name)

    def barChart(self, df, x, y, hue, title, aspect='horizontal'):
        """
        Plot a bar chart using the seaborn module.

        The dataframe is converted with ``toPandas()``. One column is then
        cast to a number (int first, float if the int cast raises
        ``ValueError``): the ``y`` column when ``aspect`` is "horizontal",
        the ``x`` column for any other value. The seaborn call itself is the
        same for both aspects.

        :param df: dataframe
        :param x: str, column plotted on the x axis
        :param y: str, column plotted on the y axis
        :param hue: str, column used for colour grouping
        :param title: str, plot title
        :param aspect: str, selects which column is cast to a number
        :return: None
        """
        frame = df.toPandas()
        numeric_col = y if aspect == "horizontal" else x
        try:
            frame[numeric_col] = frame[numeric_col].astype(int)
        except ValueError:
            # Values such as "1.5" cannot be parsed as int; fall back to float.
            frame[numeric_col] = frame[numeric_col].astype(float)
        sns.catplot(x=x, y=y, hue=hue, data=frame, kind="bar")
        plt.title(title)
        plt.show()