# file: sales_prediction_proxy.py
"""Train a sales-prediction model and a proxy-product finder.

Executing this module reads the product CSV, fits one shared preprocessing
pipeline (TF-IDF on the description, one-hot on the categorical columns,
scaling on the price), trains an XGBoost regressor on ``sales_in_release``,
fits a cosine nearest-neighbour index on the same feature matrix, and saves
the three fitted objects under ``models/``.
"""
import os

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

# =====================
# Load Data
# =====================
data_path = r"C:\Users\joao.herculano\Documents\products.csv"  # Update with your path
df = pd.read_csv(data_path)

# =====================
# Preprocessing Pipeline
# =====================
categorical_features = ['brand', 'category', 'subcategory']
numeric_features = ['price']
text_feature = 'description'
target = 'sales_in_release'

# TfidfVectorizer cannot handle NaN, so missing descriptions become ''.
df[text_feature] = df[text_feature].fillna('')

text_vectorizer = TfidfVectorizer(max_features=100)

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore' keeps transform() from failing on categories
    # that were not present in the training data.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer([
    # The text column is given as a bare string (not a list) so the
    # vectorizer receives a 1-D sequence of documents.
    ('text', text_vectorizer, text_feature),
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])

# =====================
# Fit Preprocessing
# =====================
X = preprocessor.fit_transform(df)
y = df[target].values

# joblib.dump does not create parent directories; without this the first
# dump fails with FileNotFoundError on a fresh checkout.
os.makedirs('models', exist_ok=True)

# Save preprocessors
joblib.dump(preprocessor, 'models/preprocessor.joblib')

# =====================
# Train Sales Prediction Model
# =====================
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X, y)
joblib.dump(xgb_model, 'models/sales_predictor.joblib')

# =====================
# Fit Proxy Finder Model
# =====================
# Fitted on the same matrix as the regressor, so neighbour indices are row
# positions in ``df``.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)
joblib.dump(knn, 'models/proxy_finder.joblib')
# =====================
# Functions
# =====================
def _vectorize_product(new_product: dict):
    """Convert one product dict into a feature row for the fitted models.

    Loads the saved preprocessor, wraps ``new_product`` in a one-row
    DataFrame, ensures a non-null ``description`` value, and returns the
    transformed feature matrix.
    """
    # NOTE: joblib files are pickles; only load artifacts this script wrote.
    preprocessor = joblib.load('models/preprocessor.joblib')
    df_new = pd.DataFrame([new_product])
    # A product without a description key would otherwise raise KeyError on
    # the fillna below; treat it the same as an empty description.
    if 'description' not in df_new.columns:
        df_new['description'] = ''
    df_new['description'] = df_new['description'].fillna('')
    return preprocessor.transform(df_new)


def predict_sales(new_product: dict) -> float:
    """Return the predicted release sales for a single product.

    Args:
        new_product: Mapping of feature name to value. The example in this
            file supplies ``description``, ``brand``, ``category``,
            ``subcategory`` and ``price``.

    Returns:
        The regressor's prediction for the product, as a plain ``float``.
    """
    model = joblib.load('models/sales_predictor.joblib')
    X_new = _vectorize_product(new_product)
    prediction = model.predict(X_new)
    return float(prediction[0])


def get_proxies(new_product: dict, top_n=5) -> list:
    """Return the catalogue products most similar to ``new_product``.

    Args:
        new_product: Mapping of feature name to value, same shape as for
            ``predict_sales``.
        top_n: Maximum number of neighbours to return. Values larger than
            the number of catalogue rows are clamped to that number.

    Returns:
        A list of dicts, one per neighbour, each holding the catalogue row's
        columns plus a ``distance`` entry from the nearest-neighbour search.
    """
    knn = joblib.load('models/proxy_finder.joblib')
    X_new = _vectorize_product(new_product)
    # kneighbors raises ValueError when asked for more neighbours than were
    # fitted; the index was fitted on the rows of ``df``.
    n_neighbors = min(top_n, len(df))
    distances, indices = knn.kneighbors(X_new, n_neighbors=n_neighbors)
    # Indices are positional, so use iloc against the module-level frame.
    proxies = df.iloc[indices[0]].copy()
    proxies['distance'] = distances[0]
    return proxies.to_dict(orient='records')


# =====================
# Example Usage
# =====================
def main() -> None:
    """Run the example: predict sales and list proxies for a sample product."""
    new_product = {
        'description': 'Wireless Noise Cancelling Headphones',
        'brand': 'SoundCore',
        'category': 'Electronics',
        'subcategory': 'Headphones',
        'price': 99.99,
    }

    sales_estimate = predict_sales(new_product)
    print(f"Predicted Sales: {sales_estimate}")

    proxies = get_proxies(new_product, top_n=5)
    for proxy in proxies:
        print(f"Proxy: {proxy['description']} | Brand: {proxy['brand']} | Sold: {proxy['sales_in_release']} | Distance: {proxy['distance']:.4f}")


if __name__ == '__main__':
    main()