In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import anderson
from statsmodels.tsa.seasonal import STL


# Part 1: Data Importation, Inspection & Merging 
paths = {
    'Store_A': r'D:\Portfolio 2.0\Datasets\store_a_sales_data.csv',
    'Store_B': r'D:\Portfolio 2.0\Datasets\store_b_sales_data.csv',
    'Store_C': r'D:\Portfolio 2.0\Datasets\store_c_sales_data.csv',
    'Pricing': r'D:\Portfolio 2.0\Datasets\product_pricing_data.csv'
}


# Load data
dfs = {k: pd.read_csv(v) for k, v in paths.items()}
# Clean names
for df in dfs.values():
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
# Melt sales
def melt_data(df, label):
    m = df.melt(id_vars=['date','store_name'], var_name='product', value_name='units_sold')
    m['store'] = label
    return m
sales = pd.concat([
    melt_data(dfs['Store_A'], 'Store_A'),
    melt_data(dfs['Store_B'], 'Store_B'),
    melt_data(dfs['Store_C'], 'Store_C')
], ignore_index=True)
# Merge pricing
sales['product'] = sales['product'].str.replace('product_','', regex=False).astype(int)
pricing = dfs['Pricing']
pricing['product'] = pricing['product'].str.replace('Product_','', regex=False).astype(int)
merged = sales.merge(pricing, on='product', how='left')
merged['date'] = pd.to_datetime(merged['date'])
# Data quality
print("Data Quality Checks:")
print(merged.info())
print(merged.describe())
print("Duplicates:", merged.duplicated().sum())


# Part 2: Distribution Analysis
print("\nDistribution Analysis (Anderson-Darling) & Visuals:")
for p in sorted(merged['product'].unique()):
    vals = merged.loc[merged['product']==p,'units_sold']
    res = anderson(vals)
    print(f"Product {p}: stat={res.statistic:.2f}, crit@5%={res.critical_values[2]}")
    # Histogram + KDE
    plt.figure(figsize=(6,4))
    sns.histplot(vals, kde=True)
    plt.title(f'Units Sold Distribution - Product {p}')
    plt.tight_layout(); plt.savefig(f'dist_hist_product_{p}.png'); plt.show()
    # Boxplot
    plt.figure(figsize=(6,2))
    sns.boxplot(x=vals)
    plt.title(f'Units Sold Boxplot - Product {p}')
    plt.tight_layout(); plt.savefig(f'dist_box_product_{p}.png'); plt.show()
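
# --- Optional sketch: summarise the normality tests without extra printing ---
# Hypothetical helper (not in the original analysis). The Anderson-Darling
# statistic is compared with the 5% critical value (index 2 of
# res.critical_values); a statistic above it rejects normality, which is the
# case for every product printed above. Computed silently.
def rejects_normality(values, sig_index=2):
    res = anderson(values)
    return res.statistic > res.critical_values[sig_index]

normality_flags = {p: rejects_normality(merged.loc[merged['product'] == p, 'units_sold'])
                   for p in sorted(merged['product'].unique())}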


# Part 3: Summary Statistics
print("\nSummary Statistics & Visuals:")
pivot = merged.pivot_table(index='product', columns='store', values='units_sold', aggfunc=['sum','mean']).round(2)
print(pivot)
daily = merged.groupby('date')['units_sold'].agg(['sum','mean','std']).round(2)
print(daily.head())
# Store-level
store_sales = merged.groupby('store')['units_sold'].sum()
store_profit = merged.assign(profit=(merged['selling_price']-merged['cost_price'])*merged['units_sold']).groupby('store')['profit'].sum()
print("Store sales:\n", store_sales)
print("Store profit:\n", store_profit)
# Bar plots
plt.figure(figsize=(6,4)); store_sales.plot(kind='bar'); plt.title('Total Units Sold by Store'); plt.tight_layout(); plt.savefig('bar_store_sales.png'); plt.show()
plt.figure(figsize=(6,4)); store_profit.plot(kind='bar'); plt.title('Total Profit by Store'); plt.tight_layout(); plt.savefig('bar_store_profit.png'); plt.show()
# Most/least profitable product by per-unit profit (selling price minus cost price)
profit_unit = pricing.assign(profit=pricing['selling_price']-pricing['cost_price'])
most_prof = profit_unit.loc[profit_unit['profit'].idxmax(),'product']
least_prof = profit_unit.loc[profit_unit['profit'].idxmin(),'product']
print(f"Most profitable product: {most_prof}, Least profitable product: {least_prof}")


# Part 4: Profitability & Polynomial Regression + Ridge
merged['profit'] = (merged['selling_price']-merged['cost_price'])*merged['units_sold']
print("\nCumulative Profit per Product & Store:")
print(merged.groupby(['product','store'])['profit'].sum())

def add_features(df):
    """Add trend (day_num), cyclical month/day-of-week encodings, lag and rolling-mean features to a daily series."""
    df = df.copy()
    df['day_num'] = (df['date']-df['date'].min()).dt.days
    df['month_sin'] = np.sin(2*np.pi*df['date'].dt.month/12)
    df['month_cos'] = np.cos(2*np.pi*df['date'].dt.month/12)
    df['dow_sin'] = np.sin(2*np.pi*df['date'].dt.dayofweek/7)
    df['dow_cos'] = np.cos(2*np.pi*df['date'].dt.dayofweek/7)
    df['lag1'] = df['units_sold'].shift(1).bfill()
    df['lag7'] = df['units_sold'].shift(7).bfill()
    df['rolling7'] = df['units_sold'].rolling(7,min_periods=1).mean()
    return df

results = {}
for p in sorted(merged['product'].unique()):
    dfp = merged[merged['product']==p].groupby('date')['units_sold'].sum().reset_index()
    dfp = add_features(dfp)
    dfp['y_log'] = np.log1p(dfp['units_sold'])
    feats = ['day_num','month_sin','month_cos','dow_sin','dow_cos','lag1','lag7','rolling7']
    X, y = dfp[feats], dfp['y_log']
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)

    best_r2, best_deg = -np.inf, 2
    best_rmse, best_mae = None, None

    for deg in range(2,7):
        poly = PolynomialFeatures(degree=deg, include_bias=False)
        Xp = poly.fit_transform(Xs)
        Xtr, Xte, ytr, yte = train_test_split(Xp, y, test_size=0.2, random_state=42)
        model = Ridge(alpha=1.0).fit(Xtr, ytr)
        ypred = np.expm1(model.predict(Xte))
        ytrue = np.expm1(yte)
        r2 = r2_score(ytrue, ypred)
        rmse = np.sqrt(mean_squared_error(ytrue, ypred))
        mae  = mean_absolute_error(ytrue, ypred)
        if r2 > best_r2:
            best_r2, best_deg = r2, deg
            best_rmse, best_mae = rmse, mae

    # Fit the final model on the full series
    poly_final = PolynomialFeatures(degree=best_deg, include_bias=False)
    Xp_full = poly_final.fit_transform(Xs)
    model_final = Ridge(alpha=1.0).fit(Xp_full, y)
    preds_full = np.expm1(model_final.predict(Xp_full))
    results[p] = {'r2': best_r2, 'deg': best_deg, 'pred_full': preds_full, 'actual': np.expm1(y)}

    print(f"Product {p}: degree={best_deg}, R²={best_r2:.3f}, RMSE={best_rmse:.2f}, MAE={best_mae:.2f}")

    # 30-day forecast with an approximate 95% interval from in-sample residuals
    resid_std = (results[p]['actual'] - preds_full).std()
    last = dfp.iloc[-1]
    future = pd.DataFrame({'date': pd.date_range(last['date']+pd.Timedelta(days=1), periods=30)})
    future = add_features(future.assign(units_sold=last['units_sold']))
    Xf = scaler.transform(future[feats])
    Xpf = poly_final.transform(Xf)
    future['forecast'] = np.expm1(model_final.predict(Xpf))
    future['lower'] = future['forecast'] - 1.96*resid_std
    future['upper'] = future['forecast'] + 1.96*resid_std

    # Plot forecast
    plt.figure(figsize=(8,4))
    plt.plot(dfp['date'], dfp['units_sold'], label='Actual')
    plt.plot(future['date'], future['forecast'], '--', label='Forecast')
    plt.fill_between(future['date'], future['lower'], future['upper'], color='gray', alpha=0.2)
    plt.title(f'Product {p} Sales Forecast')
    plt.xlabel('Date'); plt.ylabel('Units Sold'); plt.legend(); plt.tight_layout()
    plt.savefig(f'forecast_interval_product_{p}.png'); plt.show()

    # Diagnostic: Actual vs Predicted
    plt.figure(figsize=(6,6))
    plt.scatter(results[p]['actual'], results[p]['pred_full'], alpha=0.5)
    lims = [min(results[p]['actual'].min(), results[p]['pred_full'].min()),
            max(results[p]['actual'].max(), results[p]['pred_full'].max())]
    plt.plot(lims, lims, 'r--')
    plt.xlabel('Actual Units Sold'); plt.ylabel('Predicted Units Sold')
    plt.title(f'Actual vs Predicted - Product {p}'); plt.tight_layout()
    plt.savefig(f'scatter_actual_pred_product_{p}.png'); plt.show()
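
# --- Optional sketch: chronological validation (not used for the results above) ---
# train_test_split shuffles rows, so the lag/rolling features can leak future
# information into training. A hedged alternative using sklearn's
# TimeSeriesSplit; it returns a score instead of printing, leaving the cell
# output unchanged.
from sklearn.model_selection import TimeSeriesSplit

def chronological_cv_r2(X, y, degree=2, alpha=1.0, n_splits=5):
    # Average out-of-fold R^2 for Ridge on polynomial features, with folds in time order.
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    for tr_idx, te_idx in tscv.split(X):
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        Xtr, Xte = poly.fit_transform(X[tr_idx]), poly.transform(X[te_idx])
        model = Ridge(alpha=alpha).fit(Xtr, y.iloc[tr_idx])
        scores.append(r2_score(np.expm1(y.iloc[te_idx]), np.expm1(model.predict(Xte))))
    return float(np.mean(scores))
# e.g. chronological_cv_r2(Xs, y) for the last product processed in the loop above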


# Diagnostics: Profit Time Series & Seasonality
daily_profit = merged.groupby('date')['profit'].sum()
plt.figure(figsize=(10,4)); daily_profit.plot(); plt.title('Daily Profit Over Time')
plt.xlabel('Date'); plt.ylabel('Profit'); plt.tight_layout()
plt.savefig('daily_profit_timeseries.png'); plt.show()

df_month = merged.copy()
df_month['month'] = df_month['date'].dt.to_period('M')
monthly_sales = df_month.groupby('month')['units_sold'].sum()
monthly_profit = df_month.groupby('month')['profit'].sum()
plt.figure(figsize=(10,4))
monthly_sales.plot(marker='o', label='Sales')
monthly_profit.plot(marker='o', label='Profit')
plt.title('Monthly Sales vs Profit')
plt.xlabel('Month'); plt.ylabel('Amount')
plt.legend(); plt.tight_layout()
plt.savefig('monthly_sales_profit.png'); plt.show()

stl = STL(daily_profit, period=30).fit()
_ = stl.plot(); plt.tight_layout()
plt.savefig('stl_decomposition.png'); plt.show()
print("STL Decomposition Insights:")
print(f"Trend strength: {stl.trend.std() / daily_profit.std():.2f}")
print(f"Seasonality strength: {stl.seasonal.std() / daily_profit.std():.2f}")


# Part 5: Insights & Recommendations

# Store profitability ranking
store_profitability = merged.groupby('store')['profit'].sum().sort_values(ascending=False)
print("\nStore profitability ranking:\n", store_profitability)

# Product sales ranking
product_sales_ranking = merged.groupby('product')['units_sold'].sum().sort_values(ascending=False)
print("\nProduct sales ranking:\n", product_sales_ranking)

# Identify the best-performing product model based on R²
best_product = max(results, key=lambda p: results[p]['r2'])
best_r2 = results[best_product]['r2']
print(f"\nBest model: Product {best_product}, R²={best_r2:.3f}")

# STL Decomposition Insights
print("\nSTL Decomposition Insights:")
print(f"Trend strength: {stl.trend.std() / daily_profit.std():.2f}")
print(f"Seasonality strength: {stl.seasonal.std() / daily_profit.std():.2f}")


# Forecasting Future Sales for the Best Performing Product
# Prepare the best-performing product's daily series
dfp_best = merged[merged['product'] == best_product].groupby('date')['units_sold'].sum().reset_index()
dfp_best = add_features(dfp_best)
dfp_best['y_log'] = np.log1p(dfp_best['units_sold'])

# Prepare feature names used
feats = ['day_num', 'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'lag1', 'lag7', 'rolling7']

# Re-fit the scaler and polynomial transformer (alternatively, reuse the objects fitted earlier)
scaler = StandardScaler()
Xs_best = scaler.fit_transform(dfp_best[feats])

poly = PolynomialFeatures(degree=results[best_product]['deg'], include_bias=False)
Xp_best = poly.fit_transform(Xs_best)

# Train Ridge model again
model_final = Ridge(alpha=1.0)
model_final.fit(Xp_best, dfp_best['y_log'])

# Forecast future dates (next 30 days)
future_dates = pd.date_range(start=dfp_best['date'].max() + pd.Timedelta(days=1), periods=30)
future = pd.DataFrame({'date': future_dates})

# Generate future features
future = add_features(future.assign(units_sold=dfp_best['units_sold'].iloc[-1]))
X_future = scaler.transform(future[feats])
Xpf_future = poly.transform(X_future)

# Predict
future_predictions = np.expm1(model_final.predict(Xpf_future))

# Create future forecast DataFrame
future_forecast = pd.DataFrame({'Date': future_dates, 'Predicted Units Sold': future_predictions})
print("\nFuture Sales Forecast for Best Product (Next 30 Days):\n", future_forecast)

# Plot historical and forecasted units sold
plt.figure(figsize=(12,6))
plt.plot(dfp_best['date'], dfp_best['units_sold'], label='Historical Units Sold')
plt.plot(future_forecast['Date'], future_forecast['Predicted Units Sold'], linestyle='--', label='Forecasted Units Sold')
plt.xlabel('Date')
plt.ylabel('Units Sold')
plt.title(f'Forecasted Units Sold for Product {best_product} (Next 30 Days)')
plt.legend()
plt.tight_layout()
plt.show()
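
# --- Optional sketch: an approximate interval around the 30-day forecast ---
# Mirrors the in-sample-residual bands used in the per-product loop above;
# hedged because it assumes a roughly constant residual spread. Computed
# silently so the table and plot above are unchanged.
resid_best = dfp_best['units_sold'] - np.expm1(model_final.predict(Xp_best))
future_forecast['Lower'] = future_forecast['Predicted Units Sold'] - 1.96 * resid_best.std()
future_forecast['Upper'] = future_forecast['Predicted Units Sold'] + 1.96 * resid_best.std()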

# Recommendations

print("\nRecommendations:")
print("- Increase inventory stock for Product 1 (highest sales volume) to meet high customer demand.")
print("- Implement dynamic inventory management for top 3 products and top 2 stores based on sales and profitability rankings.")
print(f"- Reassess pricing strategy for Product {least_prof} (lowest profit margin) to improve overall profitability.")
print("- Continue using polynomial Ridge regression models with lag, rolling mean, and cyclical features for accurate forecasting.")
print("- Perform hyperparameter tuning (e.g., adjusting Ridge alpha) and consider integrating external factors such as promotions, holidays, and economic indicators to improve forecast precision.")
Data Quality Checks:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19710 entries, 0 to 19709
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           19710 non-null  datetime64[ns]
 1   store_name     19710 non-null  object        
 2   product        19710 non-null  int64         
 3   units_sold     19710 non-null  int64         
 4   store          19710 non-null  object        
 5   cost_price     19710 non-null  float64       
 6   selling_price  19710 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 1.1+ MB
None
                      date       product    units_sold    cost_price  \
count                19710  19710.000000  19710.000000  19710.000000   
mean   2022-07-02 00:00:00      3.500000     82.050127     24.865000   
min    2021-01-01 00:00:00      1.000000      0.000000      6.340000   
25%    2021-10-01 00:00:00      2.000000     34.000000      8.470000   
50%    2022-07-02 00:00:00      3.500000     64.000000     22.610000   
75%    2023-04-02 00:00:00      5.000000    114.000000     39.510000   
max    2023-12-31 00:00:00      6.000000    299.000000     49.650000   
std                    NaN      1.707868     65.529818     17.394952   

       selling_price  
count   19710.000000  
mean       53.823333  
min        20.940000  
25%        25.160000  
50%        55.805000  
75%        80.680000  
max        84.550000  
std        27.508899  
Duplicates: 0

Distribution Analysis (Anderson-Darling) & Visuals:
Product 1: stat=39.42, crit@5%=0.786
[Figure: Units Sold Distribution - Product 1 (histogram + KDE)]
[Figure: Units Sold Boxplot - Product 1]
Product 2: stat=37.15, crit@5%=0.786
[Figure: Units Sold Distribution - Product 2 (histogram + KDE)]
[Figure: Units Sold Boxplot - Product 2]
Product 3: stat=37.81, crit@5%=0.786
[Figure: Units Sold Distribution - Product 3 (histogram + KDE)]
[Figure: Units Sold Boxplot - Product 3]
Product 4: stat=40.97, crit@5%=0.786
[Figure: Units Sold Distribution - Product 4 (histogram + KDE)]
[Figure: Units Sold Boxplot - Product 4]
Product 5: stat=39.79, crit@5%=0.786
[Figure: Units Sold Distribution - Product 5 (histogram + KDE)]
[Figure: Units Sold Boxplot - Product 5]
Product 6: stat=42.41, crit@5%=0.786
[Figure: Units Sold Distribution - Product 6 (histogram + KDE)]
[Figure: Units Sold Boxplot - Product 6]
Summary Statistics & Visuals:
            sum                    mean                
store   Store_A Store_B Store_C Store_A Store_B Store_C
product                                                
1        193593  189893  189861  176.80  173.42  173.39
2        123677  123422  124330  112.95  112.71  113.54
3         91783   92962   91723   83.82   84.90   83.77
4         59878   59696   58724   54.68   54.52   53.63
5         46836   45038   46040   42.77   41.13   42.05
6         26623   26525   26604   24.31   24.22   24.30
             sum   mean    std
date                          
2021-01-01  1312  72.89  58.75
2021-01-02  1648  91.56  93.91
2021-01-03  1771  98.39  66.30
2021-01-04  1646  91.44  71.33
2021-01-05  1573  87.39  65.00
Store sales:
 store
Store_A    542390
Store_B    537536
Store_C    537282
Name: units_sold, dtype: int64
Store profit:
 store
Store_A    16138784.98
Store_B    15939994.94
Store_C    15951199.23
Name: profit, dtype: float64
[Figure: Total Units Sold by Store]
[Figure: Total Profit by Store]
Most profitable product: 1, Least profitable product: 4

Cumulative Profit per Product & Store:
product  store  
1        Store_A    8063148.45
         Store_B    7909043.45
         Store_C    7907710.65
2        Store_A    2327601.14
         Store_B    2322802.04
         Store_C    2339890.60
3        Store_A    2270711.42
         Store_B    2299879.88
         Store_C    2269227.02
4        Store_A     746678.66
         Store_B     744409.12
         Store_C     732288.28
5        Store_A    1634576.40
         Store_B    1571826.20
         Store_C    1606796.00
6        Store_A    1096068.91
         Store_B    1092034.25
         Store_C    1095286.68
Name: profit, dtype: float64
Product 1: degree=2, R²=0.138, RMSE=125.45, MAE=104.81
[Figure: Product 1 Sales Forecast]
[Figure: Actual vs Predicted - Product 1]
Product 2: degree=2, R²=0.069, RMSE=85.60, MAE=70.17
[Figure: Product 2 Sales Forecast]
[Figure: Actual vs Predicted - Product 2]
Product 3: degree=2, R²=0.108, RMSE=66.14, MAE=54.01
[Figure: Product 3 Sales Forecast]
[Figure: Actual vs Predicted - Product 3]
Product 4: degree=2, R²=0.036, RMSE=40.72, MAE=31.72
[Figure: Product 4 Sales Forecast]
[Figure: Actual vs Predicted - Product 4]
Product 5: degree=2, R²=0.106, RMSE=39.08, MAE=31.64
[Figure: Product 5 Sales Forecast]
[Figure: Actual vs Predicted - Product 5]
Product 6: degree=2, R²=0.173, RMSE=22.51, MAE=18.13
[Figure: Product 6 Sales Forecast]
[Figure: Actual vs Predicted - Product 6]
[Figure: Daily Profit Over Time]
[Figure: Monthly Sales vs Profit]
[Figure: STL Decomposition of Daily Profit]
STL Decomposition Insights:
Trend strength: 0.17
Seasonality strength: 0.51

Store profitability ranking:
 store
Store_A    16138784.98
Store_C    15951199.23
Store_B    15939994.94
Name: profit, dtype: float64

Product sales ranking:
 product
1    573347
2    371429
3    276468
4    178298
5    137914
6     79752
Name: units_sold, dtype: int64

Best model: Product 6, R²=0.173

STL Decomposition Insights:
Trend strength: 0.17
Seasonality strength: 0.51

Future Sales Forecast for Best Product (Next 30 Days):
          Date  Predicted Units Sold
0  2024-01-01             46.150124
1  2024-01-02             48.842279
2  2024-01-03             52.531187
3  2024-01-04             51.455875
4  2024-01-05             47.514776
5  2024-01-06             46.018128
6  2024-01-07             46.032916
7  2024-01-08             46.189039
8  2024-01-09             48.863619
9  2024-01-10             52.526058
10 2024-01-11             51.437373
11 2024-01-12             47.507503
12 2024-01-13             46.035028
13 2024-01-14             46.070131
14 2024-01-15             46.227713
15 2024-01-16             48.884679
16 2024-01-17             52.520618
17 2024-01-18             51.418573
18 2024-01-19             47.499950
19 2024-01-20             46.051659
20 2024-01-21             46.107101
21 2024-01-22             46.266144
22 2024-01-23             48.905458
23 2024-01-24             52.514868
24 2024-01-25             51.399476
25 2024-01-26             47.492116
26 2024-01-27             46.068024
27 2024-01-28             46.143827
28 2024-01-29             46.304332
29 2024-01-30             48.925956
[Figure: Forecasted Units Sold for Product 6 (Next 30 Days)]
Recommendations:
- Increase inventory stock for Product 1 (highest sales volume) to meet high customer demand.
- Implement dynamic inventory management for top 3 products and top 2 stores based on sales and profitability rankings.
- Reassess pricing strategy for Product 4 (lowest profit margin) to improve overall profitability.
- Continue using polynomial Ridge regression models with lag, rolling mean, and cyclical features for accurate forecasting.
- Perform hyperparameter tuning (e.g., adjusting Ridge alpha) and consider integrating external factors such as promotions, holidays, and economic indicators to improve forecast precision.