top of page

PySpark - Linear regression for categorical features


# PySpark

# LINEAR REGRESSION FOR CATEGORICAL FEATURES

!pip install pyspark

pip install findspark

import findspark

# Use pandas to import csv

import pandas as pd

import os

from pyspark import SparkContext

from pyspark.sql import SQLContext

from pyspark import SparkConf

conf = SparkConf()

findspark.init()

import pyspark # only run after findspark.init()

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('Missing').getOrCreate()

## Read The dataset

# Load tips.csv into a Spark DataFrame. header=True takes column names from
# the first row; inferSchema=True samples the data to choose column types
# (at the cost of an extra pass over the file).
df = spark.read.csv('tips.csv',header=True,inferSchema=True)

# Display the first 20 rows
df.show()

# Print the inferred column names and types
df.printSchema()


# Column names — per the recorded output below:
# total_bill, tip, sex, vegan, day, time, size
df.columns

['total_bill', 'tip', 'sex', 'vegan', 'day', 'time', 'size']


# CATEGORICAL FEATURES

from pyspark.ml.feature import StringIndexer


# StringIndexer maps each distinct string label in 'sex' to a numeric index
# (the most frequent label gets 0.0), adding the new column 'sex_indexed'.
indexer=StringIndexer(inputCol="sex",outputCol="sex_indexed")

# fit() learns the label->index mapping from the data;
# transform() appends the indexed column to a new DataFrame.
df_r=indexer.fit(df).transform(df)

df_r.show()


# String indexer
# Index the remaining categorical columns in one pass using the plural
# inputCols/outputCols form (available in pyspark >= 3.0).
# NOTE(review): the last output column is named 'time_index', not
# 'time_indexed' — inconsistent with the others, but downstream code
# matches it, so it is kept as-is.
indexer=StringIndexer(inputCols=["vegan","day","time"],outputCols=["vegan_indexed","day_indexed",

"time_index"])

df_r=indexer.fit(df_r).transform(df_r)

df_r.show()


df_r.columns


# Vector Assembler - transformer that combines a given list of columns into a single vector column

from pyspark.ml.feature import VectorAssembler

# Pack all predictors (tip, size, and the indexed categorical columns) into a
# single vector column — Spark ML estimators require features in this form.
featureassembler=VectorAssembler(inputCols=['tip','size','sex_indexed','vegan_indexed','day_indexed',

'time_index'],outputCol="Independent Features")

# Appends the "Independent Features" vector column to the DataFrame
output=featureassembler.transform(df_r)


output.select('Independent Features').show()


output.show()

# Keep only the feature vector and the label column (total_bill) for training
finalized_data=output.select("Independent Features","total_bill")


finalized_data.show()


from pyspark.ml.regression import LinearRegression

# train test split

# train - ~75% of rows, used to fit the model

# test - ~25% of rows, held out for evaluation

# NOTE(review): randomSplit is not seeded here, so the split — and the fitted
# coefficients shown below — will vary between runs.
train_data,test_data=finalized_data.randomSplit([0.75,0.25])

# Predict total_bill from the assembled feature vector
regressor=LinearRegression(featuresCol='Independent Features', labelCol='total_bill')

# fit() returns a trained LinearRegressionModel (rebinding the same name)
regressor=regressor.fit(train_data)


# Coefficients - y = 3X + 5. +3 is the coefficient, X is the predictor, and +5 is the constant.

# One learned coefficient per entry of the "Independent Features" vector
regressor.coefficients

DenseVector([3.371, 3.08, -1.0656, 2.5811, -0.202, -0.4421])


# Intercept - the constant term: the value at which the regression line crosses the y-axis.

regressor.intercept

1.4262345958610814


# Predictions

# evaluate() scores the model on the held-out data and returns a
# LinearRegressionSummary carrying the predictions and the error metrics.
pred_results=regressor.evaluate(test_data)


# Final comparison

# Prediction

# Shows the feature vector, the actual total_bill, and the model's prediction side by side
pred_results.predictions.show()


# Performance Metrics

# r2 - r squared - coefficient of determination - how well the model fits the dataset, on a scale from 0 to 1: 0 means no fit, 1 a perfect fit.

# Mean Absolute Error - MAE - it doesn't punish huge errors. It is usually used when the performance is measured on continuous variable data. It gives a linear value, which averages the weighted individual differences equally. The lower the value, the better the model's performance.

# Mean Squared Error - MSE - one of the most commonly used metrics, but least useful when a single bad prediction would ruin the entire model's predicting abilities, i.e. when the dataset contains a lot of noise. It is most useful when the dataset contains outliers, or unexpected values (too high or too low).


pred_results.r2,pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.5298873487374176, 4.687624854268287, 40.64130645461818)






https://www.researchgate.net/publication/221472805_Feature_Extraction_for_Regression_Problems_and_an_Example_Application_for_Pose_Estimation_of_a_Face

https://medium.com/@nutanbhogendrasharma/feature-transformer-vectorassembler-in-pyspark-ml-feature-part-3-b3c2c3c93ee9


22 views · 0 comments

Recent Posts

See All

Python - sktime

There are various libraries created for Python Time Series. Each of them has its own style, contributors and functions. Each library has...

Comments


bottom of page