-
Notifications
You must be signed in to change notification settings - Fork 27
Description
I tried serializing the pipeline below.
If I try removing the `init` argument from `serialize_to_bundle`,
the error is:
"AttributeError: 'OutletTypeEncoder' object has no attribute 'op'"
# Import the required libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
#from sklearn.preprocessing import StandardScaler, MinMaxScaler, Imputer, Binarizer, PolynomialFeatures
from sklearn.pipeline import Pipeline
import mleap.sklearn.pipeline
import mleap.sklearn.feature_union
import mleap.sklearn.base
import mleap.sklearn.logistic
import mleap.sklearn.preprocessing
# Read the training data set.
data = pd.read_csv('market.csv')

# Show the top rows of the data (left disabled).
#print(data.head(5))

# Separate the independent variables from the target variable.
train_x = data.drop(columns=['Item_Outlet_Sales'])
train_y = data['Item_Outlet_Sales']
# Import the BaseEstimator
from sklearn.base import BaseEstimator
# Define the class OutletTypeEncoder.
# This is our custom transformer; it creates 3 new binary columns.
# A custom transformer must have the methods fit and transform.
class OutletTypeEncoder(BaseEstimator):
    """Custom transformer that adds three binary (0/1) indicator columns.

    The new columns are ``outlet_grocery_store``, ``outlet_supermarket_3``
    and ``outlet_identifier_OUT027``; they are derived from the
    ``Outlet_Type`` and ``Outlet_Identifier`` columns of the input.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, documents, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, x_dataset):
        """Return a copy of ``x_dataset`` with the three indicator columns added.

        ``x_dataset`` must provide the columns ``Outlet_Type`` and
        ``Outlet_Identifier`` (assumed to be a pandas DataFrame -- TODO confirm).
        The input object itself is left unmodified.
        """
        # Work on a copy so the caller's data is not mutated as a side effect.
        encoded = x_dataset.copy()
        # Multiplying the boolean comparison by 1 turns True/False into 1/0.
        encoded['outlet_grocery_store'] = (encoded['Outlet_Type'] == 'Grocery Store') * 1
        encoded['outlet_supermarket_3'] = (encoded['Outlet_Type'] == 'Supermarket Type3') * 1
        encoded['outlet_identifier_OUT027'] = (encoded['Outlet_Identifier'] == 'OUT027') * 1
        return encoded
# Pre-processing step:
#   - drop the listed columns,
#   - impute the missing values in column Item_Weight by the mean,
#   - scale the data in the column Item_MRP.
# remainder='passthrough' keeps every column not named below.
pre_process = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # 'Outlet_Identifier' was listed twice; one entry is sufficient.
        ('drop_columns', 'drop', [
            'Item_Identifier',
            'Outlet_Identifier',
            'Item_Fat_Content',
            'Item_Type',
            'Outlet_Size',
            'Outlet_Location_Type',
            'Outlet_Type',
        ]),
        ('impute_item_weight', SimpleImputer(strategy='mean'), ['Item_Weight']),
        ('scale_data', StandardScaler(), ['Item_MRP']),
    ],
)
# Define the pipeline.
#   Step 1: get the outlet binary columns
#   Step 2: pre-processing
#   Step 3: train a random forest model
model_pipeline = Pipeline(steps=[
    ('get_outlet_binary_columns', OutletTypeEncoder()),
    ('pre_processing', pre_process),
    ('random_forest', RandomForestRegressor(max_depth=10, random_state=2)),
])
# Fit the pipeline with the training data.
model_pipeline.fit(train_x, train_y)

# Read the test data.
test_data = pd.read_csv('test.csv')

# Predict the target variable on the test data (left disabled).
#model_pipeline.predict(test_data)

# Serialize the random forest model pipeline.
# NOTE(review): the report quotes "AttributeError: 'OutletTypeEncoder' object
# has no attribute 'op'"; presumably it is raised by this call because the
# custom transformer lacks attributes MLeap expects -- confirm against MLeap.
model_pipeline.serialize_to_bundle('/tmp', 'market.rf', init=True)