In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import anderson
from statsmodels.tsa.seasonal import STL
# Part 1: Data Importation, Inspection & Merging
paths = {
'Store_A': r'D:\Portfolio 2.0\Datasets\store_a_sales_data.csv',
'Store_B': r'D:\Portfolio 2.0\Datasets\store_b_sales_data.csv',
'Store_C': r'D:\Portfolio 2.0\Datasets\store_c_sales_data.csv',
'Pricing': r'D:\Portfolio 2.0\Datasets\product_pricing_data.csv'
}
# Load data
dfs = {k: pd.read_csv(v) for k, v in paths.items()}
# Clean names
for df in dfs.values():
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
# Melt sales
def melt_data(df, label):
m = df.melt(id_vars=['date','store_name'], var_name='product', value_name='units_sold')
m['store'] = label
return m
sales = pd.concat([
melt_data(dfs['Store_A'], 'Store_A'),
melt_data(dfs['Store_B'], 'Store_B'),
melt_data(dfs['Store_C'], 'Store_C')
], ignore_index=True)
# Merge pricing
sales['product'] = sales['product'].str.replace('product_','', regex=False).astype(int)
pricing = dfs['Pricing']
pricing['product'] = pricing['product'].str.replace('Product_','', regex=False).astype(int)
merged = sales.merge(pricing, on='product', how='left')
merged['date'] = pd.to_datetime(merged['date'])
# Data quality
print("Data Quality Checks:")
print(merged.info())
print(merged.describe())
print("Duplicates:", merged.duplicated().sum())
# Part 2: Distribution Analysis
print("\nDistribution Analysis (Anderson-Darling) & Visuals:")
for p in sorted(merged['product'].unique()):
vals = merged.loc[merged['product']==p,'units_sold']
res = anderson(vals)
print(f"Product {p}: stat={res.statistic:.2f}, crit@5%={res.critical_values[2]}")
# Histogram + KDE
plt.figure(figsize=(6,4))
sns.histplot(vals, kde=True)
plt.title(f'Units Sold Distribution - Product {p}')
plt.tight_layout(); plt.savefig(f'dist_hist_product_{p}.png'); plt.show()
# Boxplot
plt.figure(figsize=(6,2))
sns.boxplot(x=vals)
plt.title(f'Units Sold Boxplot - Product {p}')
plt.tight_layout(); plt.savefig(f'dist_box_product_{p}.png'); plt.show()
# Part 3: Summary Statistics
print("\nSummary Statistics & Visuals:")
pivot = merged.pivot_table(index='product', columns='store', values='units_sold', aggfunc=['sum','mean']).round(2)
print(pivot)
daily = merged.groupby('date')['units_sold'].agg(['sum','mean','std']).round(2)
print(daily.head())
# Store-level
store_sales = merged.groupby('store')['units_sold'].sum()
store_profit = merged.assign(profit=(merged['selling_price']-merged['cost_price'])*merged['units_sold']).groupby('store')['profit'].sum()
print("Store sales:\n", store_sales)
print("Store profit:\n", store_profit)
# Bar plots
plt.figure(figsize=(6,4)); store_sales.plot(kind='bar'); plt.title('Total Units Sold by Store'); plt.tight_layout(); plt.savefig('bar_store_sales.png'); plt.show()
plt.figure(figsize=(6,4)); store_profit.plot(kind='bar'); plt.title('Total Profit by Store'); plt.tight_layout(); plt.savefig('bar_store_profit.png'); plt.show()
# Most/Least profitable
profit_unit = pricing.assign(profit=pricing['selling_price']-pricing['cost_price'])
most_prof = profit_unit.loc[profit_unit['profit'].idxmax(),'product']
least_prof = profit_unit.loc[profit_unit['profit'].idxmin(),'product']
print(f"Most profitable product: {most_prof}, Least profitable product: {least_prof}")
# Part 4: Profitability & Polynomial Regression + Ridge
merged['profit'] = (merged['selling_price']-merged['cost_price'])*merged['units_sold']
print("\nCumulative Profit per Product & Store:")
print(merged.groupby(['product','store'])['profit'].sum())
def add_features(df):
df = df.copy()
df['day_num'] = (df['date']-df['date'].min()).dt.days
df['month_sin'] = np.sin(2*np.pi*df['date'].dt.month/12)
df['month_cos'] = np.cos(2*np.pi*df['date'].dt.month/12)
df['dow_sin'] = np.sin(2*np.pi*df['date'].dt.dayofweek/7)
df['dow_cos'] = np.cos(2*np.pi*df['date'].dt.dayofweek/7)
df['lag1'] = df['units_sold'].shift(1).bfill()
df['lag7'] = df['units_sold'].shift(7).bfill()
df['rolling7'] = df['units_sold'].rolling(7,min_periods=1).mean()
return df
results = {}
for p in sorted(merged['product'].unique()):
dfp = merged[merged['product']==p].groupby('date')['units_sold'].sum().reset_index()
dfp = add_features(dfp)
dfp['y_log'] = np.log1p(dfp['units_sold'])
feats = ['day_num','month_sin','month_cos','dow_sin','dow_cos','lag1','lag7','rolling7']
X, y = dfp[feats], dfp['y_log']
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
best_r2, best_deg = -np.inf, 2
best_rmse, best_mae = None, None
for deg in range(2,7):
poly = PolynomialFeatures(degree=deg, include_bias=False)
Xp = poly.fit_transform(Xs)
Xtr, Xte, ytr, yte = train_test_split(Xp, y, test_size=0.2, random_state=42)
model = Ridge(alpha=1.0).fit(Xtr, ytr)
ypred = np.expm1(model.predict(Xte))
ytrue = np.expm1(yte)
r2 = r2_score(ytrue, ypred)
rmse = np.sqrt(mean_squared_error(ytrue, ypred))
mae = mean_absolute_error(ytrue, ypred)
if r2 > best_r2:
best_r2, best_deg = r2, deg
best_rmse, best_mae = rmse, mae
# Fit final on full
poly_final = PolynomialFeatures(degree=best_deg, include_bias=False)
Xp_full = poly_final.fit_transform(Xs)
model_final = Ridge(alpha=1.0).fit(Xp_full, y)
preds_full = np.expm1(model_final.predict(Xp_full))
results[p] = {'r2': best_r2, 'deg': best_deg, 'pred_full': preds_full, 'actual': np.expm1(y)}
print(f"Product {p}: degree={best_deg}, R²={best_r2:.3f}, RMSE={best_rmse:.2f}, MAE={best_mae:.2f}")
# Forecast
resid_std = (results[p]['actual'] - preds_full).std()
last = dfp.iloc[-1]
future = pd.DataFrame({'date': pd.date_range(last['date']+pd.Timedelta(days=1), periods=30)})
future = add_features(future.assign(units_sold=last['units_sold']))
Xf = scaler.transform(future[feats])
Xpf = poly_final.transform(Xf)
future['forecast'] = np.expm1(model_final.predict(Xpf))
future['lower'] = future['forecast'] - 1.96*resid_std
future['upper'] = future['forecast'] + 1.96*resid_std
# Plot forecast
plt.figure(figsize=(8,4))
plt.plot(dfp['date'], dfp['units_sold'], label='Actual')
plt.plot(future['date'], future['forecast'], '--', label='Forecast')
plt.fill_between(future['date'], future['lower'], future['upper'], color='gray', alpha=0.2)
plt.title(f'Product {p} Sales Forecast')
plt.xlabel('Date'); plt.ylabel('Units Sold'); plt.legend(); plt.tight_layout()
plt.savefig(f'forecast_interval_product_{p}.png'); plt.show()
# Diagnostic: Actual vs Predicted
plt.figure(figsize=(6,6))
plt.scatter(results[p]['actual'], results[p]['pred_full'], alpha=0.5)
lims = [min(results[p]['actual'].min(), results[p]['pred_full'].min()),
max(results[p]['actual'].max(), results[p]['pred_full'].max())]
plt.plot(lims, lims, 'r--')
plt.xlabel('Actual Units Sold'); plt.ylabel('Predicted Units Sold')
plt.title(f'Actual vs Predicted - Product {p}'); plt.tight_layout()
plt.savefig(f'scatter_actual_pred_product_{p}.png'); plt.show()
# Diagnostics: Profit Time Series & Seasonality
daily_profit = merged.groupby('date')['profit'].sum()
plt.figure(figsize=(10,4)); daily_profit.plot(); plt.title('Daily Profit Over Time')
plt.xlabel('Date'); plt.ylabel('Profit'); plt.tight_layout()
plt.savefig('daily_profit_timeseries.png'); plt.show()
df_month = merged.copy()
df_month['month'] = df_month['date'].dt.to_period('M')
monthly_sales = df_month.groupby('month')['units_sold'].sum()
monthly_profit = df_month.groupby('month')['profit'].sum()
plt.figure(figsize=(10,4))
monthly_sales.plot(marker='o', label='Sales')
monthly_profit.plot(marker='o', label='Profit')
plt.title('Monthly Sales vs Profit')
plt.xlabel('Month'); plt.ylabel('Amount')
plt.legend(); plt.tight_layout()
plt.savefig('monthly_sales_profit.png'); plt.show()
stl = STL(daily_profit, period=30).fit()
_ = stl.plot(); plt.tight_layout()
plt.savefig('stl_decomposition.png'); plt.show()
print("STL Decomposition Insights:")
print(f"Trend strength: {stl.trend.std() / daily_profit.std():.2f}")
print(f"Seasonality strength: {stl.seasonal.std() / daily_profit.std():.2f}")
# Part 5: Insights & Recommendations
# Store profitability ranking
store_profitability = merged.groupby('store')['profit'].sum().sort_values(ascending=False)
print("\nStore profitability ranking:\n", store_profitability)
# Product sales ranking
product_sales_ranking = merged.groupby('product')['units_sold'].sum().sort_values(ascending=False)
print("\nProduct sales ranking:\n", product_sales_ranking)
# Identify the best-performing product model based on R²
best_product = max(results, key=lambda p: results[p]['r2'])
best_r2 = results[best_product]['r2']
print(f"\nBest model: Product {best_product}, R²={best_r2:.3f}")
# STL Decomposition Insights
print("\nSTL Decomposition Insights:")
print(f"Trend strength: {stl.trend.std() / daily_profit.std():.2f}")
print(f"Seasonality strength: {stl.seasonal.std() / daily_profit.std():.2f}")
# Forecasting Future Sales for the Best Performing Product
# Prepare last product dataframe
dfp_best = merged[merged['product'] == best_product].groupby('date')['units_sold'].sum().reset_index()
dfp_best = add_features(dfp_best)
dfp_best['y_log'] = np.log1p(dfp_best['units_sold'])
# Prepare feature names used
feats = ['day_num', 'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'lag1', 'lag7', 'rolling7']
# Fit scaler and polynomial transformer again (optional: you can save from earlier steps)
scaler = StandardScaler()
Xs_best = scaler.fit_transform(dfp_best[feats])
poly = PolynomialFeatures(degree=results[best_product]['deg'], include_bias=False)
Xp_best = poly.fit_transform(Xs_best)
# Train Ridge model again
model_final = Ridge(alpha=1.0)
model_final.fit(Xp_best, dfp_best['y_log'])
# Forecast future dates (next 30 days)
future_dates = pd.date_range(start=dfp_best['date'].max() + pd.Timedelta(days=1), periods=30)
future = pd.DataFrame({'date': future_dates})
# Generate future features
future = add_features(future.assign(units_sold=dfp_best['units_sold'].iloc[-1]))
X_future = scaler.transform(future[feats])
Xpf_future = poly.transform(X_future)
# Predict
future_predictions = np.expm1(model_final.predict(Xpf_future))
# Create future forecast DataFrame
future_forecast = pd.DataFrame({'Date': future_dates, 'Predicted Units Sold': future_predictions})
print("\nFuture Sales Forecast for Best Product (Next 30 Days):\n", future_forecast)
# Plot historical and forecasted units sold
plt.figure(figsize=(12,6))
plt.plot(dfp_best['date'], dfp_best['units_sold'], label='Historical Units Sold')
plt.plot(future_forecast['Date'], future_forecast['Predicted Units Sold'], linestyle='--', label='Forecasted Units Sold')
plt.xlabel('Date')
plt.ylabel('Units Sold')
plt.title(f'Forecasted Units Sold for Product {best_product} (Next 30 Days)')
plt.legend()
plt.tight_layout()
plt.show()
# Recommendations
print("\nRecommendations:")
print("- Increase inventory stock for Product 1 (highest sales volume) to meet high customer demand.")
print("- Implement dynamic inventory management for top 3 products and top 2 stores based on sales and profitability rankings.")
print(f"- Reassess pricing strategy for Product {least_prof} (lowest profit margin) to improve overall profitability.")
print("- Continue using polynomial Ridge regression models with lag, rolling mean, and cyclical features for accurate forecasting.")
print("- Perform hyperparameter tuning (e.g., adjusting Ridge alpha) and consider integrating external factors such as promotions, holidays, and economic indicators to improve forecast precision.")
Data Quality Checks: <class 'pandas.core.frame.DataFrame'> RangeIndex: 19710 entries, 0 to 19709 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 19710 non-null datetime64[ns] 1 store_name 19710 non-null object 2 product 19710 non-null int64 3 units_sold 19710 non-null int64 4 store 19710 non-null object 5 cost_price 19710 non-null float64 6 selling_price 19710 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(2), object(2) memory usage: 1.1+ MB None date product units_sold cost_price \ count 19710 19710.000000 19710.000000 19710.000000 mean 2022-07-02 00:00:00 3.500000 82.050127 24.865000 min 2021-01-01 00:00:00 1.000000 0.000000 6.340000 25% 2021-10-01 00:00:00 2.000000 34.000000 8.470000 50% 2022-07-02 00:00:00 3.500000 64.000000 22.610000 75% 2023-04-02 00:00:00 5.000000 114.000000 39.510000 max 2023-12-31 00:00:00 6.000000 299.000000 49.650000 std NaN 1.707868 65.529818 17.394952 selling_price count 19710.000000 mean 53.823333 min 20.940000 25% 25.160000 50% 55.805000 75% 80.680000 max 84.550000 std 27.508899 Duplicates: 0 Distribution Analysis (Anderson-Darling) & Visuals: Product 1: stat=39.42, crit@5%=0.786
Product 2: stat=37.15, crit@5%=0.786
Product 3: stat=37.81, crit@5%=0.786
Product 4: stat=40.97, crit@5%=0.786
Product 5: stat=39.79, crit@5%=0.786
Product 6: stat=42.41, crit@5%=0.786
Summary Statistics & Visuals: sum mean store Store_A Store_B Store_C Store_A Store_B Store_C product 1 193593 189893 189861 176.80 173.42 173.39 2 123677 123422 124330 112.95 112.71 113.54 3 91783 92962 91723 83.82 84.90 83.77 4 59878 59696 58724 54.68 54.52 53.63 5 46836 45038 46040 42.77 41.13 42.05 6 26623 26525 26604 24.31 24.22 24.30 sum mean std date 2021-01-01 1312 72.89 58.75 2021-01-02 1648 91.56 93.91 2021-01-03 1771 98.39 66.30 2021-01-04 1646 91.44 71.33 2021-01-05 1573 87.39 65.00 Store sales: store Store_A 542390 Store_B 537536 Store_C 537282 Name: units_sold, dtype: int64 Store profit: store Store_A 16138784.98 Store_B 15939994.94 Store_C 15951199.23 Name: profit, dtype: float64
Most profitable product: 1, Least profitable product: 4 Cumulative Profit per Product & Store: product store 1 Store_A 8063148.45 Store_B 7909043.45 Store_C 7907710.65 2 Store_A 2327601.14 Store_B 2322802.04 Store_C 2339890.60 3 Store_A 2270711.42 Store_B 2299879.88 Store_C 2269227.02 4 Store_A 746678.66 Store_B 744409.12 Store_C 732288.28 5 Store_A 1634576.40 Store_B 1571826.20 Store_C 1606796.00 6 Store_A 1096068.91 Store_B 1092034.25 Store_C 1095286.68 Name: profit, dtype: float64 Product 1: degree=2, R²=0.138, RMSE=125.45, MAE=104.81
Product 2: degree=2, R²=0.069, RMSE=85.60, MAE=70.17
Product 3: degree=2, R²=0.108, RMSE=66.14, MAE=54.01
Product 4: degree=2, R²=0.036, RMSE=40.72, MAE=31.72
Product 5: degree=2, R²=0.106, RMSE=39.08, MAE=31.64
Product 6: degree=2, R²=0.173, RMSE=22.51, MAE=18.13
STL Decomposition Insights: Trend strength: 0.17 Seasonality strength: 0.51 Store profitability ranking: store Store_A 16138784.98 Store_C 15951199.23 Store_B 15939994.94 Name: profit, dtype: float64 Product sales ranking: product 1 573347 2 371429 3 276468 4 178298 5 137914 6 79752 Name: units_sold, dtype: int64 Best model: Product 6, R²=0.173 STL Decomposition Insights: Trend strength: 0.17 Seasonality strength: 0.51 Future Sales Forecast for Best Product (Next 30 Days): Date Predicted Units Sold 0 2024-01-01 46.150124 1 2024-01-02 48.842279 2 2024-01-03 52.531187 3 2024-01-04 51.455875 4 2024-01-05 47.514776 5 2024-01-06 46.018128 6 2024-01-07 46.032916 7 2024-01-08 46.189039 8 2024-01-09 48.863619 9 2024-01-10 52.526058 10 2024-01-11 51.437373 11 2024-01-12 47.507503 12 2024-01-13 46.035028 13 2024-01-14 46.070131 14 2024-01-15 46.227713 15 2024-01-16 48.884679 16 2024-01-17 52.520618 17 2024-01-18 51.418573 18 2024-01-19 47.499950 19 2024-01-20 46.051659 20 2024-01-21 46.107101 21 2024-01-22 46.266144 22 2024-01-23 48.905458 23 2024-01-24 52.514868 24 2024-01-25 51.399476 25 2024-01-26 47.492116 26 2024-01-27 46.068024 27 2024-01-28 46.143827 28 2024-01-29 46.304332 29 2024-01-30 48.925956
Recommendations: - Increase inventory stock for Product 1 (highest sales volume) to meet high customer demand. - Implement dynamic inventory management for top 3 products and top 2 stores based on sales and profitability rankings. - Reassess pricing strategy for Product 4 (lowest profit margin) to improve overall profitability. - Continue using polynomial Ridge regression models with lag, rolling mean, and cyclical features for accurate forecasting. - Perform hyperparameter tuning (e.g., adjusting Ridge alpha) and consider integrating external factors such as promotions, holidays, and economic indicators to improve forecast precision.