# PySpark - Data Frame
!pip install pyspark
pip install findspark
import findspark
# Use pandas to import csv
import pandas as pd
type(pd.read_csv('Assets.csv'))
import os
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkConf
conf = SparkConf()
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
from pyspark.sql import SparkSession
import sys
# Wget is a free GNU command-line utility tool used to download files from the internet.
# Install oped JDK 8 for PySpark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
from pyspark import SparkConf
conf = SparkConf()
master=conf.setMaster('yarn-client')
app=conf.setAppName('anaconda-pyspark')
conf.set('yarn-client', 'anaconda-pyspark') # Optional configurations
spark=SparkSession.builder.appName('Practise').getOrCreate()
pyspark
# Load the CSV into a PySpark DataFrame, treating the first row as column
# headers. (Without the 'header' option every column would be auto-named
# _c0, _c1, ... and the header row would become data.)
df = spark.read.option('header', 'true').csv('Assets.csv')

# Inspect the result (notebook-style expressions/actions).
type(df)
df.printSchema()
# Display the first 3 rows of the PySpark dataframe.
df.show(3)
# Summary statistics (count, mean, stddev, min, max) per column.
df.describe().show()
### FILTER
### Rows of the bank names with year equal to 2017
df.filter("Year=2017").show()

# Sum by Aggregation: total Employees per bank.
# (The original ran this identical aggregation twice, once labelled "Max";
# it is a sum — the dict maps column name -> aggregate function.)
group_data = df.groupBy("Bank Name")
group_data.agg({'Employees': 'sum'}).show()
# Change name of columns with alias.
# countDistinct and stddev were used without being imported (NameError at
# runtime); import them alongside format_number.
from pyspark.sql.functions import countDistinct, format_number, stddev

# Count of distinct Employees values, renamed via alias().
df.select(countDistinct("Employees").alias("Distinct Employees")).show()

# Standard deviation of Employees, rendered with 2 decimal places.
sales_std = df.select(stddev("Employees").alias('std'))
# format_number("col_name", decimal_places)
sales_std.select(format_number('std', 2).alias('std_2digits')).show()
# References / further reading (commented out — bare URLs are syntax errors
# in a Python file):
# https://www.guru99.com/pyspark-tutorial.html
# https://medium.com/swlh/pyspark-on-macos-installation-and-use-31f84ca61400
# https://stackoverflow.com/questions/63216201/how-to-install-python3-9-with-conda
# https://docs.anaconda.com/anaconda-scale/howto/spark-configuration/#scale-spark-config-sparkcontext
# https://docs.datastax.com/en/jdk-install/doc/jdk-install/installOpenJdkDeb.html
# https://docs.anaconda.com/anaconda-scale/howto/spark-configuration/
# https://www.dataquest.io/blog/pyspark-installation-guide/
# https://notadatascientist.com/install-spark-on-macos/