top of page

PySpark - Linear regression for categorical features


# PySpark

# LINEAR REGRESSION FOR CATEGORICAL FEATURES

!pip install pyspark

pip install findspark

import findspark

# Use pandas to import csv

import pandas as pd

import os

from pyspark import SparkContext

from pyspark.sql import SQLContext

from pyspark import SparkConf

conf = SparkConf()

findspark.init()

import pyspark # only run after findspark.init()

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('Missing').getOrCreate()

## Read The dataset

# Load tips.csv into a Spark DataFrame. header=True takes column names from
# the first row; inferSchema=True samples the data to choose column types
# (at the cost of an extra pass over the file).
df = spark.read.csv('tips.csv',header=True,inferSchema=True)

# Display the first 20 rows
df.show()

# Print the inferred column names and types
df.printSchema()


# Column names — per the recorded output below:
# total_bill, tip, sex, vegan, day, time, size
df.columns

['total_bill', 'tip', 'sex', 'vegan', 'day', 'time', 'size']


# CATEGORICAL FEATURES

from pyspark.ml.feature import StringIndexer


# StringIndexer maps each distinct string label in 'sex' to a numeric index
# (the most frequent label gets 0.0), adding the new column 'sex_indexed'.
indexer=StringIndexer(inputCol="sex",outputCol="sex_indexed")

# fit() learns the label->index mapping from the data;
# transform() appends the indexed column to a new DataFrame.
df_r=indexer.fit(df).transform(df)

df_r.show()


# String indexer
# Index the remaining categorical columns in one pass using the plural
# inputCols/outputCols form (available in pyspark >= 3.0).
# NOTE(review): the last output column is named 'time_index', not
# 'time_indexed' — inconsistent with the others, but downstream code
# matches it, so it is kept as-is.
indexer=StringIndexer(inputCols=["vegan","day","time"],outputCols=["vegan_indexed","day_indexed",

"time_index"])

df_r=indexer.fit(df_r).transform(df_r)

df_r.show()


df_r.columns


# Vector Assembler - transformer that combines a given list of columns into a single vector column

from pyspark.ml.feature import VectorAssembler

# Pack all predictors (tip, size, and the indexed categorical columns) into a
# single vector column — Spark ML estimators require features in this form.
featureassembler=VectorAssembler(inputCols=['tip','size','sex_indexed','vegan_indexed','day_indexed',

'time_index'],outputCol="Independent Features")

# Appends the "Independent Features" vector column to the DataFrame
output=featureassembler.transform(df_r)


output.select('Independent Features').show()


output.show()

# Keep only the feature vector and the label column (total_bill) for training
finalized_data=output.select("Independent Features","total_bill")


finalized_data.show()


from pyspark.ml.regression import LinearRegression

# train test split

# train - ~75% of rows, used to fit the model

# test - ~25% of rows, held out for evaluation

# NOTE(review): randomSplit is not seeded here, so the split — and the fitted
# coefficients shown below — will vary between runs.
train_data,test_data=finalized_data.randomSplit([0.75,0.25])

# Predict total_bill from the assembled feature vector
regressor=LinearRegression(featuresCol='Independent Features', labelCol='total_bill')

# fit() returns a trained LinearRegressionModel (rebinding the same name)
regressor=regressor.fit(train_data)


# Coefficients - y = 3X + 5. +3 is the coefficient, X is the predictor, and +5 is the constant.

# One learned coefficient per entry of the "Independent Features" vector
regressor.coefficients

DenseVector([3.371, 3.08, -1.0656, 2.5811, -0.202, -0.4421])


# Intercept - the constant term: the value at which the regression line crosses the y-axis.

regressor.intercept

1.4262345958610814


# Predictions

# evaluate() scores the model on the held-out data and returns a
# LinearRegressionSummary carrying the predictions and the error metrics.
pred_results=regressor.evaluate(test_data)


# Final comparison

# Prediction

# Shows the feature vector, the actual total_bill, and the model's prediction side by side
pred_results.predictions.show()


# Performance Metrics

# r2 - r squared - coefficient of determination - how well the model fits the dataset, on a scale from 0 to 1: 0 means no fit, 1 a perfect fit.

# Mean Absolute Error - MAE - it doesn't punish huge errors. It is usually used when the performance is measured on continuous variable data. It gives a linear value, which averages the weighted individual differences equally. The lower the value, the better the model's performance.

# Mean Squared Error - MSE - one of the most commonly used metrics, but least useful when a single bad prediction would ruin the entire model's predicting abilities, i.e. when the dataset contains a lot of noise. It is most useful when the dataset contains outliers, or unexpected values (too high or too low).


pred_results.r2,pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.5298873487374176, 4.687624854268287, 40.64130645461818)






https://www.researchgate.net/publication/221472805_Feature_Extraction_for_Regression_Problems_and_an_Example_Application_for_Pose_Estimation_of_a_Face

https://medium.com/@nutanbhogendrasharma/feature-transformer-vectorassembler-in-pyspark-ml-feature-part-3-b3c2c3c93ee9


22 views · 0 comments

Recent Posts

See All

Python - sktime

There are various libraries created for Python Time Series. Each of them has its own style, contributors and functions. Each library has...

Comments


bottom of page