# Ruptura_Projetada/previsão de lançamento/sales_prediction_proxy.py

# file: sales_prediction_proxy.py

import os

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

# =====================
# Load Data
# =====================
# Machine-specific location of the product catalogue CSV; update with your path.
data_path = r"C:\Users\joao.herculano\Documents\products.csv"

# The whole catalogue is kept in memory as `df`; later sections fit on it and
# get_proxies() looks neighbour rows up in it.
df = pd.read_csv(filepath_or_buffer=data_path)

# =====================
# Preprocessing Pipeline
# =====================
# Column roles within the catalogue frame.
target = 'sales_in_release'
text_feature = 'description'
numeric_features = ['price']
categorical_features = ['brand', 'category', 'subcategory']

# Missing descriptions become empty strings before they reach the vectorizer.
df[text_feature] = df[text_feature].fillna('')

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: label gaps as 'missing', then one-hot encode.
# Categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Free text: TF-IDF restricted to 100 features.
text_vectorizer = TfidfVectorizer(max_features=100)

# The transformer order below fixes the column order of the output matrix:
# text features first, then categorical, then numeric.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_vectorizer, text_feature),
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features),
    ]
)

# =====================
# Fit Preprocessing
# =====================
# Fit the preprocessor on the full catalogue. The resulting matrix X feeds both
# the sales regressor and the nearest-neighbour index below, so row i of X
# corresponds to row i of df.
X = preprocessor.fit_transform(df)
# NOTE(review): rows with a missing target are passed on unfiltered --
# confirm the CSV has none.
y = df[target].values

# joblib.dump does not create parent directories, so make sure the artifact
# folder exists before the first save (otherwise a fresh checkout fails with
# FileNotFoundError).
os.makedirs('models', exist_ok=True)

# Save preprocessors
joblib.dump(preprocessor, 'models/preprocessor.joblib')

# =====================
# Train Sales Prediction Model
# =====================
# Gradient-boosted regressor mapping product features to release sales;
# random_state is fixed for reproducible training.
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X, y)
joblib.dump(xgb_model, 'models/sales_predictor.joblib')

# =====================
# Fit Proxy Finder Model
# =====================
# Cosine-distance neighbour index over the same feature matrix, used to find
# existing products that resemble a new one.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)
joblib.dump(knn, 'models/proxy_finder.joblib')

# =====================
# Functions
# =====================
def predict_sales(new_product: dict) -> float:
    """Predict release sales for a single new product.

    The fitted preprocessor and regressor are loaded from the ``models/``
    directory on every call.

    Args:
        new_product: Mapping of feature name to value for one product (the
            example in this file supplies 'description', 'brand',
            'category', 'subcategory' and 'price'). 'description' may be
            omitted or None; it is then treated as an empty string, the
            same way missing descriptions are handled at training time.

    Returns:
        The regressor's prediction for the product as a plain float.
    """
    preprocessor = joblib.load('models/preprocessor.joblib')
    model = joblib.load('models/sales_predictor.joblib')

    df_new = pd.DataFrame([new_product])
    # A product without a description would otherwise raise KeyError on the
    # next line; add the column so it is handled like any other blank text.
    if 'description' not in df_new.columns:
        df_new['description'] = ''
    df_new['description'] = df_new['description'].fillna('')

    X_new = preprocessor.transform(df_new)
    prediction = model.predict(X_new)
    # Single-row input, so the first element is the only prediction.
    return float(prediction[0])

def get_proxies(new_product: dict, top_n: int = 5) -> list:
    """Find the catalogue products most similar to a new product.

    The fitted preprocessor and neighbour index are loaded from the
    ``models/`` directory on every call. Neighbour positions are looked up
    in the module-level ``df``, which is the frame the index was fitted on
    in this file.

    Args:
        new_product: Mapping of feature name to value for one product.
            'description' may be omitted or None; it is then treated as an
            empty string, the same way missing descriptions are handled at
            training time.
        top_n: Number of proxies requested. Values larger than the number
            of indexed products are reduced to that number instead of
            raising. ``None`` falls back to the index's fitted default.

    Returns:
        A list of dicts, one per proxy, in the order returned by the index.
        Each dict holds the matching row's columns from ``df`` plus a
        'distance' entry with the distance reported by the index.
    """
    preprocessor = joblib.load('models/preprocessor.joblib')
    knn = joblib.load('models/proxy_finder.joblib')

    df_new = pd.DataFrame([new_product])
    # A product without a description would otherwise raise KeyError on the
    # next line; add the column so it is handled like any other blank text.
    if 'description' not in df_new.columns:
        df_new['description'] = ''
    df_new['description'] = df_new['description'].fillna('')
    X_new = preprocessor.transform(df_new)

    # kneighbors raises ValueError when asked for more neighbours than were
    # indexed, so cap the request at the size of the fitted data.
    if top_n is not None:
        top_n = min(top_n, knn.n_samples_fit_)
    distances, indices = knn.kneighbors(X_new, n_neighbors=top_n)

    # Single-row query: row 0 of each result array holds its neighbours.
    proxies = df.iloc[indices[0]].copy()
    proxies['distance'] = distances[0]
    return proxies.to_dict(orient='records')

# =====================
# Example Usage
# =====================
if __name__ == '__main__':
    # Sample launch candidate used to exercise both helpers end to end.
    new_product = dict(
        description='Wireless Noise Cancelling Headphones',
        brand='SoundCore',
        category='Electronics',
        subcategory='Headphones',
        price=99.99,
    )

    # Point estimate from the saved regressor.
    sales_estimate = predict_sales(new_product)
    print(f"Predicted Sales: {sales_estimate}")

    # Most similar catalogue items, printed with their recorded release sales.
    proxies = get_proxies(new_product, top_n=5)
    for proxy in proxies:
        print(f"Proxy: {proxy['description']} | Brand: {proxy['brand']} | Sold: {proxy['sales_in_release']} | Distance: {proxy['distance']:.4f}")