# PySpark
# LINEAR REGRESSION FOR CATEGORICAL FEATURES
# --- Environment setup -------------------------------------------------------
import os
import subprocess
import sys

# Install the two third-party packages using the interpreter that is running
# this file. The previous `!pip install ...` / bare `pip install ...` lines are
# notebook-only syntax and a SyntaxError in plain Python; `python -m pip` works
# both inside a notebook and when the file is run as a script.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "pyspark", "findspark"]
)

import findspark

# findspark.init() must run before the first pyspark import to have any
# effect, so it is called here, ahead of every `pyspark` import below.
findspark.init()

# Use pandas to import csv
# NOTE(review): neither pandas nor os is used in the visible part of this
# script; the imports are kept in case later code relies on them.
import pandas as pd
import pyspark  # only run after findspark.init()
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

# Create (or reuse) the Spark session used by everything below.
spark = SparkSession.builder.appName('Missing').getOrCreate()
## Read the dataset
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to infer each column's type.
df = spark.read.csv(
    'tips.csv',
    header=True,
    inferSchema=True,
)

# Quick look at the rows, the schema and the column names.
df.show()
df.printSchema()
df.columns
# Output recorded from a notebook run:
#   ['total_bill', 'tip', 'sex', 'vegan', 'day', 'time', 'size']
# CATEGORICAL FEATURES
# The string-valued columns are indexed into numeric columns before they are
# assembled into the regression's feature vector.
from pyspark.ml.feature import StringIndexer

# Single column: index "sex" into "sex_indexed".
indexer = StringIndexer(inputCol="sex", outputCol="sex_indexed")
df_r = indexer.fit(df).transform(df)
df_r.show()

# Several columns at once: index "vegan", "day" and "time".
# NOTE: the third output column is named "time_index" (not "time_indexed");
# the VectorAssembler step further down refers to it by that exact name.
indexer = StringIndexer(
    inputCols=["vegan", "day", "time"],
    outputCols=["vegan_indexed", "day_indexed", "time_index"],
)
df_r = indexer.fit(df_r).transform(df_r)
df_r.show()
df_r.columns
# VectorAssembler: a transformer that combines a given list of columns into a
# single vector column.
from pyspark.ml.feature import VectorAssembler

# Features: the numeric columns 'tip' and 'size' plus the four indexed
# categorical columns, packed into one column named "Independent Features".
featureassembler = VectorAssembler(
    inputCols=[
        'tip',
        'size',
        'sex_indexed',
        'vegan_indexed',
        'day_indexed',
        'time_index',
    ],
    outputCol="Independent Features",
)
output = featureassembler.transform(df_r)

# Inspect the assembled vectors on their own, then the full frame.
output.select('Independent Features').show()
output.show()

# Keep only what the regression needs: the feature vector and the target.
finalized_data = output.select("Independent Features", "total_bill")
finalized_data.show()
from pyspark.ml.regression import LinearRegression

# Train/test split: 75% of the rows are used for training and 25% are held out
# for testing. No seed is passed to randomSplit, so the split (and therefore
# every number recorded below) changes from run to run.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

# Predict 'total_bill' from the assembled feature vector.
regressor = LinearRegression(featuresCol='Independent Features', labelCol='total_bill')
# NOTE: `regressor` is rebound here from the unfitted estimator to the fitted
# model; the attributes read below belong to the fitted model.
regressor = regressor.fit(train_data)

# Coefficients: in y = 3X + 5, 3 is the coefficient, X is the predictor and
# 5 is the constant (intercept). There is one coefficient per assembled
# feature column.
regressor.coefficients
# Output recorded from one notebook run (kept as a comment: executed as code
# it raised NameError, because DenseVector is never imported in this file):
#   DenseVector([3.371, 3.08, -1.0656, 2.5811, -0.202, -0.4421])

# Intercept: the constant term, i.e. the value at which the regression line
# crosses the y-axis.
regressor.intercept
# Output recorded from the same run:
#   1.4262345958610814
# Predictions: evaluate the fitted model on the held-out test split.
pred_results = regressor.evaluate(test_data)

# Final comparison: show the predictions frame produced by the evaluation.
pred_results.predictions.show()

# Performance metrics
# r2 (R squared, coefficient of determination): how well the model fits the
#   data. 1 is a perfect fit and 0 means no better than always predicting the
#   mean; on held-out data it can even be negative.
# Mean Absolute Error (MAE): the average of the absolute prediction errors.
#   Every error counts in proportion to its size, so large errors are not
#   punished extra. Lower is better.
# Mean Squared Error (MSE): the average of the squared prediction errors.
#   Squaring makes large errors dominate, so a few outliers or very bad
#   predictions can inflate it heavily. Lower is better.
(
    pred_results.r2,
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)
# Output recorded from one notebook run:
#   (0.5298873487374176, 4.687624854268287, 40.64130645461818)
# References:
# https://www.researchgate.net/publication/221472805_Feature_Extraction_for_Regression_Problems_and_an_Example_Application_for_Pose_Estimation_of_a_Face
# https://medium.com/@nutanbhogendrasharma/feature-transformer-vectorassembler-in-pyspark-ml-feature-part-3-b3c2c3c93ee9
# Comments