Geek Culture

A new tech publication by Start it up (https://medium.com/swlh).

Follow publication

Mobile Price Classification: An Open Source Data Science Project with Dagshub

Davis David
Geek Culture
Published in
13 min readDec 23, 2022

--

Photo by Rann Vijay from Pexels:

How to Create a Project using Dagshub

Screenshot from the Dagshub platform (Author).

Mobile Price Dataset

Packages Installation

Import Python Packages

# import packages
import json
import os

import joblib
import mlflow
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# set autolog for sklearn and choose the experiment that runs are recorded under
mlflow.sklearn.autolog()
mlflow.set_experiment('Ml-classification-experiment')

# seed NumPy's global random number generator
np.random.seed(1234)

Load and Version the Mobile Price Dataset

# load the raw mobile price dataset into a DataFrame
raw_data = pd.read_csv("data/raw/data.csv")
Screenshot from Dagshub (Author).
dvc remote add origin https://dagshub.com/Davisy/Mobile-Price-ML-Classification-Project.dvc
dvc commit -f data/raw.dvc
# report the dataset dimensions
print(raw_data.shape)

# the label is the price_range column; every other column is a predictor
target = raw_data["price_range"].values
features = raw_data.drop(columns=["price_range"])

Data Preprocessing

# Split first, then fit the scaler on the training rows only, so that the
# validation rows do not influence the scaling statistics (data leakage).
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=1,
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)

# kept so that any code still referring to the fully scaled feature matrix works
features_scaled = scaler.transform(features)

# inspect the first scaled training row
print(X_train[0])
# Output recorded in the original run, when the scaler was still fit on the
# full dataset; with the train-only fit the values will differ slightly:
# [ 1.56947055 -0.9900495   1.32109556 -1.01918398  0.15908825 -1.04396559
#  -1.49088996  1.03435682  0.61459469  0.20963905  1.00341448 -0.93787756
#  -0.57283137 -1.3169798   0.40204724  1.43112714  0.73023981  0.55964063
#   0.99401789  0.98609664]
# column names shared by the train and validation frames
feature_names = list(features.columns)

# train set: scaled features and the price_range label side by side
X_train_df = pd.DataFrame(X_train, columns=feature_names)
y_train_df = pd.DataFrame(y_train, columns=["price_range"])
train_df = pd.concat([X_train_df, y_train_df], axis=1)

# validation set: assembled the same way
X_valid_df = pd.DataFrame(X_valid, columns=feature_names)
y_valid_df = pd.DataFrame(y_valid, columns=["price_range"])
valid_df = pd.concat([X_valid_df, y_valid_df], axis=1)

# save processed train and valid set; the row index is written under the "Index" header
for frame, csv_path in (
    (train_df, 'data/processed/data_train.csv'),
    (valid_df, 'data/processed/data_valid.csv'),
):
    frame.to_csv(csv_path, index_label='Index')
dvc commit -f process_data.dvc
# save the trained scaler
# (the Streamlit app loads this same file to transform user input before predicting)
joblib.dump(scaler, 'model/mobile_price_scaler.pkl')

Training Machine Learning Algorithms

Screenshot from the Dagshub repository (Author).
# using MLflow tracking: send runs to the remote tracking URI of the project
TRACKING_URI = "https://dagshub.com/Davisy/Mobile-Price-ML-Classification-Project.mlflow"
mlflow.set_tracking_uri(TRACKING_URI)

# credentials exposed through environment variables; the values here are
# placeholders -- substitute your own and avoid committing real secrets
os.environ.update(
    {
        "MLFLOW_TRACKING_USERNAME": "username",
        "MLFLOW_TRACKING_PASSWORD": "password",
    }
)
# load the processed data for both train and valid set

X_train = train_df[train_df.columns[:-1]]
y_train = train_df['price_range']

X_valid = valid_df[valid_df.columns[:-1]]
y_valid = valid_df['price_range']
# train randomforest algorithm

rf_classifier = RandomForestClassifier(n_estimators=200, criterion="gini")

# the context manager ends the run on exit, so no explicit mlflow.end_run() is needed
with mlflow.start_run():
    # train the model
    rf_classifier.fit(X_train, y_train)

    # make predictions on the validation set
    y_pred = rf_classifier.predict(X_valid)

    # check performance; accuracy_score takes (y_true, y_pred)
    score = accuracy_score(y_valid, y_pred)

print(score)
# train logistic regression algorithm

lg_classifier = LogisticRegression(penalty='l2', C=1.0)

# the context manager ends the run on exit, so no explicit mlflow.end_run() is needed
with mlflow.start_run():
    # train the model
    lg_classifier.fit(X_train, y_train)

    # make predictions on the validation set
    y_pred = lg_classifier.predict(X_valid)

    # check performance; accuracy_score takes (y_true, y_pred)
    score = accuracy_score(y_valid, y_pred)

print(score)
Screenshot from the Dagshub repository (Author).
dvc commit -f model.dvc

Register the Best Model with MLflow

# Grab the run ID
run_id = '17ccd85b4c7e491bbdbcba58b5eafae1'

# Artifact path of the logged model inside the run. It has to match the folder
# the model was actually stored in: the app loads this run's model from
# mlruns/1/<run_id>/artifacts/model/model.pkl, i.e. the "model" folder.
subpath = "model"

# Select a name for the model to be registered
model_name = "Logistic Regression Model"

# build the run URI
run_uri = f'runs:/{run_id}/{subpath}'

# register the model
model_version = mlflow.register_model(run_uri, model_name)
Successfully registered model 'Logistic Regression Model'.
2022/11/10 00:22:33 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression Model, version 1
Created version '1' of model 'Logistic Regression Model'.

Deploy logged Model in MLflow with Streamlit

Create app.py file

Import Packages

# import packages
from os.path import dirname, join, realpath

import joblib
import numpy as np
import pandas as pd
import streamlit as st

Create App Title and Description

# page title, banner image and a short description of the app
APP_DESCRIPTION = "\nA simple machine learning app to classify mobile price range\n"

st.header("Mobile Price Prediction")
st.image("images/phones.jpg")
st.subheader(APP_DESCRIPTION)

Create a Form to Receive a Mobile’s details

# form to collect mobile phone details
# (the input widgets and the submit button are all created on this form object)
my_form = st.form(key="mobile_form")


# function to transform Yes and No options
def func(value):
    """Return the label displayed for a binary select-box option.

    Args:
        value: The raw option value; the select boxes offer 0 and 1.

    Returns:
        "Yes" when ``value == 1``, otherwise "No".
    """
    # NOTE: the former @st.cache decorator was dropped -- st.cache is deprecated
    # in current Streamlit releases and caching a one-line pure function gains nothing.
    return "Yes" if value == 1 else "No"


# Input widgets, created in the order they should appear in the form.
# Integer min_value -> integer-only input; float min_value -> fractional input.

battery_power = my_form.number_input(
    "Total energy a battery can store in one time measured in mAh", min_value=500
)

blue = my_form.selectbox("Has bluetooth or not", (0, 1), format_func=func)

# float bound so that fractional clock speeds (e.g. 1.5) can be entered
# NOTE(review): the lower bound is kept at 1; lower it if the training data
# contains slower clock speeds -- confirm against data/raw/data.csv
clock_speed = my_form.number_input(
    "speed at which microprocessor executes instructions", min_value=1.0
)

dual_sim = my_form.selectbox("Has dual sim support or not", (0, 1), format_func=func)

fc = my_form.number_input("Front Camera mega pixels", min_value=0)

four_g = my_form.selectbox("Has 4G or not", (0, 1), format_func=func)

int_memory = my_form.number_input("Internal Memory in Gigabytes", min_value=2)

# float bound so that a depth given in fractions of a centimetre can be entered
m_dep = my_form.number_input("Mobile Depth in cm", min_value=0.0)

mobile_wt = my_form.number_input("Weight of mobile phone", min_value=80)

n_cores = my_form.number_input("Number of cores of processor", min_value=1)

pc = my_form.number_input("Primary Camera mega pixels", min_value=0)

px_height = my_form.number_input("Pixel Resolution Height", min_value=0)

px_width = my_form.number_input("Pixel Resolution Width", min_value=0)

ram = my_form.number_input("Random Access Memory in Mega Bytes", min_value=256)

sc_h = my_form.number_input("Screen Height of mobile in cm", min_value=5)

sc_w = my_form.number_input("Screen Width of mobile in cm", min_value=0)

# NOTE(review): this label looks cut off mid-sentence; it is left untouched here
talk_time = my_form.number_input(
    "longest time that a single battery charge will last when you are", min_value=2
)

three_g = my_form.selectbox("Has 3G or not", (0, 1), format_func=func)

touch_screen = my_form.selectbox("Has touch screen or not", (0, 1), format_func=func)

wifi = my_form.selectbox("Has wifi or not", (0, 1), format_func=func)

# the rest of the script checks this flag before making a prediction
submit = my_form.form_submit_button(label="make prediction")

Load logged Model in MLflow and Scaler

# directory containing this script; both artifacts are resolved relative to it
_APP_DIR = dirname(realpath(__file__))


def _load_artifact(relative_path):
    """Open ``relative_path`` under the app directory in binary mode and load it with joblib."""
    with open(join(_APP_DIR, relative_path), "rb") as handle:
        return joblib.load(handle)


# load the mlflow registered model and scaler
mlflow_model_path = "mlruns/1/17ccd85b4c7e491bbdbcba58b5eafae1/artifacts/model/model.pkl"
model = _load_artifact(mlflow_model_path)

scaler_path = "model/mobile_price_scaler.pkl"
scaler = _load_artifact(scaler_path)

Create Result Dictionary

# result dictionary: predicted class index -> price range label
result_dict = dict(
    enumerate(("Low Cost", "Medium Cost", "High Cost", "Very High Cost"))
)

Make Predictions and Show Results

if submit:
    # collect inputs (named mobile_details so the builtin input() is not shadowed)
    mobile_details = {
        'battery_power': battery_power,
        'blue': blue,
        'clock_speed': clock_speed,
        'dual_sim': dual_sim,
        'fc': fc,
        'four_g': four_g,
        'int_memory': int_memory,
        'm_dep': m_dep,
        'mobile_wt': mobile_wt,
        'n_cores': n_cores,
        'pc': pc,
        'px_height': px_height,
        'px_width': px_width,
        'ram': ram,
        'sc_h': sc_h,
        'sc_w': sc_w,
        'talk_time': talk_time,
        'three_g': three_g,
        'touch_screen': touch_screen,
        'wifi': wifi,
    }

    # create a single-row dataframe
    data = pd.DataFrame(mobile_details, index=[0])

    # transform input; re-attach the column names because the model was
    # fitted on a named DataFrame rather than a bare array
    data_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns)

    # perform prediction
    prediction = model.predict(data_scaled)
    output = int(prediction[0])

    # Display results of the Mobile price prediction
    st.header("Results")
    st.write(f" Price range is {result_dict[output]} ")

Test the Data Science Web App

streamlit run app.py
Screenshot from the web app (Author).
Screenshot from the Streamlit Web app (Author).

Deploy Streamlit Web App in the Streamlit Cloud

Screenshot from Streamlit Cloud. Image from the author.

Conclusion

Free

Distraction-free reading. No ads.

Organize your knowledge with lists and highlights.

Tell your story. Find your audience.

Membership

Read member-only stories

Support writers you read most

Earn money for your writing

Listen to audio narrations

Read offline with the Medium app

--

--

Davis David
Davis David

Written by Davis David

Data Scientist 📊 | Software Developer | Technical Writer 📝 | ML Course Author 👨🏽‍💻 | Giving talks. Check my new ML course: https://bit.ly/OptimizeMLModels

Responses (1)

Write a response