# file: sales_prediction_proxy.py

import os

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

# =====================
# Load Data
# =====================
# CSV of products that everything below is fitted on. The columns this script
# reads from it are: description, brand, category, subcategory, price and
# sales_in_release.
data_path = r"C:\Users\joao.herculano\Documents\products.csv"  # Update with your path
df = pd.read_csv(data_path)

# =====================
# Preprocessing Pipeline
# =====================
# Role of each column in the products table.
categorical_features = ['brand', 'category', 'subcategory']
numeric_features = ['price']
text_feature = 'description'
target = 'sales_in_release'

# Replace missing descriptions with empty strings before vectorising.
df[text_feature] = df[text_feature].fillna('')

# Free text -> TF-IDF weights, capped at 100 vocabulary terms.
text_vectorizer = TfidfVectorizer(max_features=100)

# Categoricals: fill gaps with a literal 'missing' level, then one-hot encode.
# Levels not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerics: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# The text column is selected with a plain string (not a list) so the
# vectorizer receives a 1-D column of documents; the other two groups are
# selected with lists and arrive as 2-D frames.
preprocessor = ColumnTransformer(transformers=[
    ('text', text_vectorizer, text_feature),
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])

# =====================
# Fit Preprocessing
# =====================
# Fit the text, categorical and numeric transformers on the whole table and
# encode every row into the feature matrix used by both models below.
X = preprocessor.fit_transform(df)
# NOTE(review): rows with a missing target are passed through unchanged here;
# confirm the input data has no gaps in this column before training.
y = df[target].values

# Save preprocessors
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before the first artifact is written. The later dumps
# in this script write into the same folder.
os.makedirs('models', exist_ok=True)
joblib.dump(preprocessor, 'models/preprocessor.joblib')

# =====================
# Train Sales Prediction Model
# =====================
# Regressor from the encoded product features to the sales target;
# random_state pins the seed used by the model.
xgb_model = XGBRegressor(
    n_estimators=100,
    random_state=42,
)
xgb_model.fit(X, y)

# Persist the trained model so predict_sales() can load it from disk.
joblib.dump(xgb_model, 'models/sales_predictor.joblib')

# =====================
# Fit Proxy Finder Model
# =====================
# Nearest-neighbour index over the same encoded matrix, compared by cosine
# distance. Row i of the index corresponds to row i of `df`.
knn = NearestNeighbors(
    n_neighbors=5,
    metric='cosine',
)
knn.fit(X)

# Persist the index so get_proxies() can load it from disk.
joblib.dump(knn, 'models/proxy_finder.joblib')

# =====================
# Functions
# =====================


def predict_sales(new_product: dict) -> float:
    """Predict the sales target for a single product.

    The fitted preprocessor and regressor are loaded from the ``models/``
    directory on every call, so the training section of this script must
    have written them beforehand.

    Args:
        new_product: Mapping of raw column name to value, using the same
            column names the preprocessor was fitted on (the example at the
            bottom of this file passes description, brand, category,
            subcategory and price). A missing or ``None`` ``description`` is
            treated as an empty string, mirroring how missing descriptions
            are handled before fitting.

    Returns:
        The model's prediction for the product as a plain Python float.
    """
    preprocessor = joblib.load('models/preprocessor.joblib')
    model = joblib.load('models/sales_predictor.joblib')

    # One-row frame so the column-based preprocessor can be reused as-is.
    df_new = pd.DataFrame([new_product])

    # The description is optional: create the column when the key is absent
    # so the lookup below cannot raise KeyError, then blank out None/NaN.
    if 'description' not in df_new.columns:
        df_new['description'] = ''
    df_new['description'] = df_new['description'].fillna('')

    X_new = preprocessor.transform(df_new)
    prediction = model.predict(X_new)
    # predict() returns one value per input row; unwrap the only one.
    return float(prediction[0])


def get_proxies(new_product: dict, top_n: int = 5) -> list:
    """Return the products from the table most similar to ``new_product``.

    The fitted preprocessor and nearest-neighbour index are loaded from the
    ``models/`` directory on every call. Neighbour positions are looked up in
    the module-level ``df``, so ``df`` must still hold the same rows, in the
    same order, as when the index was fitted.

    Args:
        new_product: Mapping of raw column name to value, using the same
            column names the preprocessor was fitted on. A missing or
            ``None`` ``description`` is treated as an empty string.
        top_n: Maximum number of neighbours to return. Requests larger than
            the number of indexed rows are reduced to that number instead of
            raising.

    Returns:
        A list of dicts, one per neighbour, containing every column of the
        matching ``df`` row plus a ``distance`` key holding the distance
        reported by the index for that neighbour.
    """
    preprocessor = joblib.load('models/preprocessor.joblib')
    knn = joblib.load('models/proxy_finder.joblib')

    # One-row frame so the column-based preprocessor can be reused as-is.
    df_new = pd.DataFrame([new_product])

    # The description is optional: create the column when the key is absent
    # so the lookup below cannot raise KeyError, then blank out None/NaN.
    if 'description' not in df_new.columns:
        df_new['description'] = ''
    df_new['description'] = df_new['description'].fillna('')
    X_new = preprocessor.transform(df_new)

    # kneighbors() rejects n_neighbors greater than the number of fitted
    # samples, so cap the request for small tables.
    top_n = min(top_n, knn.n_samples_fit_)
    distances, indices = knn.kneighbors(X_new, n_neighbors=top_n)

    # Single query row -> take row 0 of both result arrays. Copy the slice so
    # adding the distance column never touches the shared table.
    proxies = df.iloc[indices[0]].copy()
    proxies['distance'] = distances[0]
    return proxies.to_dict(orient='records')


# =====================
# Example Usage
# =====================
if __name__ == '__main__':
    # Sample product used to exercise both entry points.
    sample_product = {
        'description': 'Wireless Noise Cancelling Headphones',
        'brand': 'SoundCore',
        'category': 'Electronics',
        'subcategory': 'Headphones',
        'price': 99.99,
    }

    sales_estimate = predict_sales(sample_product)
    print(f"Predicted Sales: {sales_estimate}")

    # List the five closest existing products with their recorded sales.
    for proxy in get_proxies(sample_product, top_n=5):
        print(
            f"Proxy: {proxy['description']} | Brand: {proxy['brand']} | "
            f"Sold: {proxy['sales_in_release']} | Distance: {proxy['distance']:.4f}"
        )