Replace missing values using numpy and pandas.
Treating missing values
In [16]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
Figuring out what data is missing
In [17]:
# NaN is used throughout as the marker for a missing entry.
missing = np.nan

# Eight-slot series in which positions 2 and 6 are deliberately left missing.
row_labels = [
    'row 1',
    'row 2',
    missing,
    'row 4',
    'row 5',
    'row 6',
    missing,
    'row 8',
]
series_obj = Series(row_labels)
series_obj
Out[17]:
0 row 1 1 row 2 2 NaN 3 row 4 4 row 5 5 row 6 6 NaN 7 row 8 dtype: object
In [18]:
# Boolean mask: True wherever the series holds a missing value.
pd.isnull(series_obj)
Out[18]:
0 False 1 False 2 True 3 False 4 False 5 False 6 True 7 False dtype: bool
Filling in for missing values
In [19]:
# Fix the seed so the random 6x6 frame below is the same on every run.
np.random.seed(25)
random_values = np.random.rand(6, 6)
DF_obj = DataFrame(random_values)
DF_obj
Out[19]:
012345
00.8701240.5822770.2788390.1859110.4111000.117376
10.6849690.4376110.5562290.3670800.4023660.113041
20.4470310.5854450.1619850.5207190.3260510.699186
30.3663950.8363750.4813430.5165020.3830480.997541
40.5142440.5590530.0344500.7199300.4210040.436935
50.2817010.9002740.6696120.4560690.2898040.525819
In [20]:
# Punch holes into the frame. `.loc` slices by label and includes both
# endpoints, so rows 3-5 of column 0 and rows 1-4 of column 5 become missing.
for row_span, column in ((slice(3, 5), 0), (slice(1, 4), 5)):
    DF_obj.loc[row_span, column] = missing
DF_obj
Out[20]:
012345
00.8701240.5822770.2788390.1859110.4111000.117376
10.6849690.4376110.5562290.3670800.402366NaN
20.4470310.5854450.1619850.5207190.326051NaN
3NaN0.8363750.4813430.5165020.383048NaN
4NaN0.5590530.0344500.7199300.421004NaN
5NaN0.9002740.6696120.4560690.2898040.525819
In [21]:
# Replace every missing entry with the same constant; DF_obj itself is untouched
# because fillna returns a new frame.
filled_DF = DF_obj.fillna(value=0)
filled_DF
Out[21]:
012345
00.8701240.5822770.2788390.1859110.4111000.117376
10.6849690.4376110.5562290.3670800.4023660.000000
20.4470310.5854450.1619850.5207190.3260510.000000
30.0000000.8363750.4813430.5165020.3830480.000000
40.0000000.5590530.0344500.7199300.4210040.000000
50.0000000.9002740.6696120.4560690.2898040.525819
In [22]:
# Per-column replacement: keys are column labels, values are the fill for
# missing entries in that column. Columns not listed are left as they are.
fill_by_column = {0: 0.1, 5: 1.25}
filled_DF = DF_obj.fillna(value=fill_by_column)
filled_DF
Out[22]:
012345
00.8701240.5822770.2788390.1859110.4111000.117376
10.6849690.4376110.5562290.3670800.4023661.250000
20.4470310.5854450.1619850.5207190.3260511.250000
30.1000000.8363750.4813430.5165020.3830481.250000
40.1000000.5590530.0344500.7199300.4210041.250000
50.1000000.9002740.6696120.4560690.2898040.525819
In [23]:
# Forward-fill: each missing entry takes the last non-missing value above it in
# the same column. `DataFrame.ffill()` is used instead of
# `fillna(method='ffill')` because the `method` keyword of `fillna` is
# deprecated in recent pandas releases; the result is the same.
fill_DF = DF_obj.ffill()
fill_DF
Out[23]:
012345
00.8701240.5822770.2788390.1859110.4111000.117376
10.6849690.4376110.5562290.3670800.4023660.117376
20.4470310.5854450.1619850.5207190.3260510.117376
30.4470310.8363750.4813430.5165020.3830480.117376
40.4470310.5590530.0344500.7199300.4210040.117376
50.4470310.9002740.6696120.4560690.2898040.525819
Counting missing values
In [24]:
# Rebuild the same seeded 6x6 frame from scratch so this section starts from a
# known state.
np.random.seed(25)
DF_obj = DataFrame(np.random.rand(6, 6))

# Re-create the gaps (label slices include both endpoints).
DF_obj.loc[1:4, 5] = missing
DF_obj.loc[3:5, 0] = missing
DF_obj
Out[24]:
012345
00.8701240.5822770.2788390.1859110.4111000.117376
10.6849690.4376110.5562290.3670800.402366NaN
20.4470310.5854450.1619850.5207190.326051NaN
3NaN0.8363750.4813430.5165020.383048NaN
4NaN0.5590530.0344500.7199300.421004NaN
5NaN0.9002740.6696120.4560690.2898040.525819
In [25]:
# Summing the boolean mask down each column counts that column's missing entries.
null_mask = DF_obj.isnull()
null_mask.sum()
Out[25]:
0 3 1 0 2 0 3 0 4 0 5 4 dtype: int64
Filtering out missing values
In [26]:
# Drop every row that contains at least one missing value (the dropna defaults,
# spelled out explicitly).
DF_no_NaN = DF_obj.dropna(axis=0, how='any')
DF_no_NaN
Out[26]:
012345
00.8701240.5822770.2788390.1859110.41110.117376
In [27]:
# Same filter applied along the other axis: drop every column that contains a
# missing value.
DF_no_NaN = DF_obj.dropna(axis='columns')
DF_no_NaN
Out[27]:
1234
00.5822770.2788390.1859110.411100
10.4376110.5562290.3670800.402366
20.5854450.1619850.5207190.326051
30.8363750.4813430.5165020.383048
40.5590530.0344500.7199300.421004
50.9002740.6696120.4560690.289804
Comments