Ruptura_Projetada/previsão de lançamento/sales_prediction_proxy.py
2025-10-24 15:54:54 -03:00

115 lines
3.5 KiB
Python

# file: sales_prediction_proxy.py
import os

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
# =====================
# Load Data
# =====================
# Location of the historical product catalogue. The PRODUCTS_CSV_PATH
# environment variable overrides the built-in default so the script can run on
# another machine without editing the source.
data_path = os.environ.get(
    "PRODUCTS_CSV_PATH",
    r"C:\Users\joao.herculano\Documents\products.csv",
)
df = pd.read_csv(data_path)
# =====================
# Preprocessing Pipeline
# =====================
# Column roles. The CSV must provide every column named here.
categorical_features = ['brand', 'category', 'subcategory']
numeric_features = ['price']
text_feature = 'description'
target = 'sales_in_release'

# Missing descriptions are replaced by '' before the text is vectorised.
df[text_feature] = df[text_feature].fillna('')

# Free-text description -> TF-IDF vector limited to 100 terms.
text_vectorizer = TfidfVectorizer(max_features=100)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# The text column is passed as a plain string (not a list) so the vectoriser
# receives a 1-D sequence of documents.
preprocessor = ColumnTransformer([
    ('text', text_vectorizer, text_feature),
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])
# =====================
# Fit Preprocessing
# =====================
# Fit every transformer on the full catalogue. The rows of X stay aligned with
# the rows of df, which get_proxies() relies on when it maps neighbour indices
# back to products.
X = preprocessor.fit_transform(df)
y = df[target].values
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before the first artifact is written.
os.makedirs('models', exist_ok=True)
# Save preprocessors
joblib.dump(preprocessor, 'models/preprocessor.joblib')
# =====================
# Train Sales Prediction Model
# =====================
# Gradient-boosted regressor predicting `target` from the feature matrix;
# random_state is fixed so repeated runs use the same seed.
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X, y)
joblib.dump(xgb_model, 'models/sales_predictor.joblib')
# =====================
# Fit Proxy Finder Model
# =====================
# Nearest-neighbour index over the same feature matrix, using cosine distance,
# for looking up the catalogue products most similar to a new one.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)
joblib.dump(knn, 'models/proxy_finder.joblib')
# =====================
# Functions
# =====================
def predict_sales(new_product: dict) -> float:
    """Predict the release sales of a single product.

    The fitted preprocessor and regressor are loaded from the ``models/``
    directory on every call, so both artifacts must already exist on disk.

    Args:
        new_product: Mapping of column name to value for one product. The
            preprocessor reads 'description', 'brand', 'category',
            'subcategory' and 'price'. 'description' may be omitted or null,
            in which case an empty description is used.

    Returns:
        The model's sales prediction for the product.
    """
    preprocessor = joblib.load('models/preprocessor.joblib')
    model = joblib.load('models/sales_predictor.joblib')

    df_new = pd.DataFrame([new_product])
    # An absent 'description' key is treated like a null description rather
    # than raising KeyError on the column lookup.
    if 'description' in df_new.columns:
        df_new['description'] = df_new['description'].fillna('')
    else:
        df_new['description'] = ''

    X_new = preprocessor.transform(df_new)
    prediction = model.predict(X_new)
    return float(prediction[0])
def get_proxies(new_product: dict, top_n: int = 5) -> list:
    """Return the catalogue products most similar to a new product.

    The fitted preprocessor and neighbour index are loaded from ``models/`` on
    every call. Neighbour indices are mapped back to rows of the module-level
    ``df``, so the saved index must have been fitted on that same data in the
    same row order.

    Args:
        new_product: Mapping of column name to value for one product. The
            preprocessor reads 'description', 'brand', 'category',
            'subcategory' and 'price'. 'description' may be omitted or null,
            in which case an empty description is used.
        top_n: Maximum number of proxies to return. Values larger than the
            catalogue are capped at the catalogue size.

    Returns:
        A list of dicts, one per proxy: the catalogue row's columns plus a
        'distance' key holding the distance reported by the neighbour index.
    """
    preprocessor = joblib.load('models/preprocessor.joblib')
    knn = joblib.load('models/proxy_finder.joblib')

    df_new = pd.DataFrame([new_product])
    # An absent 'description' key is treated like a null description rather
    # than raising KeyError on the column lookup.
    if 'description' in df_new.columns:
        df_new['description'] = df_new['description'].fillna('')
    else:
        df_new['description'] = ''

    X_new = preprocessor.transform(df_new)

    # kneighbors() rejects a request for more neighbours than fitted samples;
    # cap the request so a small catalogue returns what it has instead of
    # raising. Non-positive values are still rejected by kneighbors().
    n_neighbors = min(top_n, len(df))
    distances, indices = knn.kneighbors(X_new, n_neighbors=n_neighbors)

    # Copy so the added 'distance' column never leaks into the shared df.
    proxies = df.iloc[indices[0]].copy()
    proxies['distance'] = distances[0]
    return proxies.to_dict(orient='records')
# =====================
# Example Usage
# =====================
if __name__ == '__main__':
    # Made-up product launch used to exercise both helpers end to end.
    new_product = {
        'description': 'Wireless Noise Cancelling Headphones',
        'brand': 'SoundCore',
        'category': 'Electronics',
        'subcategory': 'Headphones',
        'price': 99.99,
    }

    # Point estimate from the regressor.
    sales_estimate = predict_sales(new_product)
    print(f"Predicted Sales: {sales_estimate}")

    # Closest catalogue products, one line each.
    for proxy in get_proxies(new_product, top_n=5):
        print(
            f"Proxy: {proxy['description']} | Brand: {proxy['brand']} | "
            f"Sold: {proxy['sales_in_release']} | Distance: {proxy['distance']:.4f}"
        )