Project Foundations for Data Science: FoodHub Data Analysis¶
Marks: 60
Context¶
The number of restaurants in New York is increasing day by day. Many students and busy professionals rely on these restaurants because of their hectic lifestyles, and online food delivery is a convenient way for them to get good food from their favorite restaurants. A food aggregator company, FoodHub, offers access to multiple restaurants through a single smartphone app.
The app allows restaurants to receive direct online orders from customers. Once the restaurant confirms an order, the app assigns a company delivery person to pick it up. The delivery person uses the map to reach the restaurant and waits for the food package. Once the package is handed over, the delivery person confirms the pick-up in the app and travels to the customer's location, confirming the drop-off in the app after delivering the food. The customer can rate the order in the app. The food aggregator earns money by collecting a fixed margin on each delivery order from the restaurants.
Objective¶
The food aggregator company has stored the data of the different orders made by the registered customers in their online portal. They want to analyze the data to get a fair idea about the demand of different restaurants which will help them in enhancing their customer experience. Suppose you are hired as a Data Scientist in this company and the Data Science team has shared some of the key questions that need to be answered. Perform the data analysis to find answers to these questions that will help the company to improve the business.
Data Description¶
The dataset contains various attributes of each food order. The detailed data dictionary is given below.
Data Dictionary¶
- order_id: Unique ID of the order
- customer_id: ID of the customer who ordered the food
- restaurant_name: Name of the restaurant
- cuisine_type: Cuisine ordered by the customer
- cost_of_the_order: Cost of the order (in dollars)
- day_of_the_week: Indicates whether the order is placed on a weekday or weekend (The weekday is from Monday to Friday and the weekend is Saturday and Sunday)
- rating: Rating given by the customer out of 5
- food_preparation_time: Time (in minutes) taken by the restaurant to prepare the food. This is calculated by taking the difference between the timestamps of the restaurant's order confirmation and the delivery person's pick-up confirmation.
- delivery_time: Time (in minutes) taken by the delivery person to deliver the food package. This is calculated by taking the difference between the timestamps of the delivery person's pick-up confirmation and drop-off confirmation.
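The two time metrics in the dictionary are differences between event timestamps. A minimal sketch of that derivation, using hypothetical timestamp columns that are not present in this dataset:

```python
import pandas as pd

# Hypothetical event log; these timestamp columns are NOT in foodhub_order.csv.
# They only illustrate how the two time metrics would be derived.
events = pd.DataFrame({
    "order_confirmed": pd.to_datetime(["2024-01-06 18:00"]),
    "pickup_confirmed": pd.to_datetime(["2024-01-06 18:27"]),
    "dropoff_confirmed": pd.to_datetime(["2024-01-06 18:51"]),
})

# food_preparation_time = pick-up confirmation - order confirmation (minutes)
events["food_preparation_time"] = (
    (events["pickup_confirmed"] - events["order_confirmed"]).dt.total_seconds() / 60
)
# delivery_time = drop-off confirmation - pick-up confirmation (minutes)
events["delivery_time"] = (
    (events["dropoff_confirmed"] - events["pickup_confirmed"]).dt.total_seconds() / 60
)
print(events[["food_preparation_time", "delivery_time"]])
```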
Let us start by importing the required libraries¶
# import libraries for data manipulation
import numpy as np
import pandas as pd
# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# import for statistical testing
from scipy import stats
# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
Understanding the structure of the data¶
# read the data
df = pd.read_csv('foodhub_order.csv')
# returns the first 5 rows
df.head()
| | order_id | customer_id | restaurant_name | cuisine_type | cost_of_the_order | day_of_the_week | rating | food_preparation_time | delivery_time |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1477147 | 337525 | Hangawi | Korean | 30.75 | Weekend | Not given | 25 | 20 |
| 1 | 1477685 | 358141 | Blue Ribbon Sushi Izakaya | Japanese | 12.08 | Weekend | Not given | 25 | 23 |
| 2 | 1477070 | 66393 | Cafe Habana | Mexican | 12.23 | Weekday | 5 | 23 | 28 |
| 3 | 1477334 | 106968 | Blue Ribbon Fried Chicken | American | 29.20 | Weekend | 3 | 25 | 15 |
| 4 | 1478249 | 76942 | Dirty Bird to Go | American | 11.59 | Weekday | 4 | 25 | 24 |
Observations:¶
The DataFrame has 9 columns as mentioned in the Data Dictionary. Data in each row corresponds to the order placed by a customer. We can see that the rating column contains 'Not given' values which should be converted to NaN for proper numerical analysis.
Data Cleaning¶
Before proceeding with analysis, we'll clean the data by converting 'Not given' values in the rating column to NaN. This ensures consistency throughout our analysis and allows us to properly handle missing ratings.
# Replace 'Not given' with NaN in the rating column
df['rating'] = df['rating'].replace('Not given', np.nan)
# Convert rating to numeric type for calculations
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
print("Data cleaning completed:")
print(f"- Rating column converted: 'Not given' → NaN")
print(f"- Rating data type: {df['rating'].dtype}")
print(f"- Missing ratings: {df['rating'].isnull().sum()}")
Data cleaning completed: - Rating column converted: 'Not given' → NaN - Rating data type: float64 - Missing ratings: 736
Question 1: How many rows and columns are present in the data? [0.5 mark]¶
# Check the shape of the dataframe
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print(f"\nDataFrame shape: {df.shape}")
Number of rows: 1898 Number of columns: 9 DataFrame shape: (1898, 9)
Observations:¶
The dataset contains 1898 rows (orders) and 9 columns (features).
Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]¶
# Use info() to print a concise summary of the DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1898 entries, 0 to 1897 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 1898 non-null int64 1 customer_id 1898 non-null int64 2 restaurant_name 1898 non-null object 3 cuisine_type 1898 non-null object 4 cost_of_the_order 1898 non-null float64 5 day_of_the_week 1898 non-null object 6 rating 1162 non-null float64 7 food_preparation_time 1898 non-null int64 8 delivery_time 1898 non-null int64 dtypes: float64(2), int64(4), object(3) memory usage: 133.6+ KB
Observations:¶
- order_id, customer_id: int64 (numeric identifiers)
- restaurant_name, cuisine_type, day_of_the_week: object (categorical/text)
- cost_of_the_order: float64 (numeric - price)
- rating: float64 (numeric - now properly converted after data cleaning)
- food_preparation_time, delivery_time: int64 (numeric - time in minutes)
- The rating column has 1162 non-null values, indicating 736 orders without ratings
Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]¶
# Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")
# Percentage of missing values
print("\nPercentage of missing values:")
missing_pct = (df.isnull().sum() / len(df)) * 100
print(missing_pct[missing_pct > 0].apply(lambda x: f"{x:.2f} %"))
# Decision: Keep NaN values in rating column as they represent meaningful information
# (orders that were not rated by customers)
print("\n" + "="*70)
print("Treatment Decision:")
print("="*70)
print("The missing values in the rating column will be kept as NaN.")
print("Rationale: These represent orders that customers chose not to rate,")
print("which is meaningful business information. Imputing these values")
print("(e.g., with mean/median) would misrepresent customer satisfaction.")
Missing values in each column: order_id 0 customer_id 0 restaurant_name 0 cuisine_type 0 cost_of_the_order 0 day_of_the_week 0 rating 736 food_preparation_time 0 delivery_time 0 dtype: int64 Total missing values: 736 Percentage of missing values: rating 38.78 % dtype: object ====================================================================== Treatment Decision: ====================================================================== The missing values in the rating column will be kept as NaN. Rationale: These represent orders that customers chose not to rate, which is meaningful business information. Imputing these values (e.g., with mean/median) would misrepresent customer satisfaction.
Observations:¶
- The rating column has 736 missing values (38.78% of orders)
- These missing values were originally 'Not given' and were converted to NaN during data cleaning
- All other columns have no missing values
- Missing ratings are kept as NaN rather than imputed, since absence of rating is meaningful business information (customers who didn't provide feedback)
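One hedged way to check whether keeping the NaNs is safe is to compare rated and unrated orders on an observable metric: if unrated orders look systematically different, the missingness is informative. A sketch with toy data standing in for the loaded DataFrame:

```python
import numpy as np
import pandas as pd

# Toy stand-in for df (the real analysis would use the loaded foodhub_order.csv).
toy = pd.DataFrame({
    "rating": [5, np.nan, 4, np.nan, 3, 5],
    "delivery_time": [20, 30, 22, 28, 25, 21],
})
toy["is_rated"] = toy["rating"].notna()

# If unrated orders show e.g. longer deliveries, imputing ratings with the
# mean/median would bias satisfaction metrics upward.
comparison = toy.groupby("is_rated")["delivery_time"].mean()
print(comparison)
```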
Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]¶
# Statistical summary of the data
print("Statistical Summary of the Dataset:")
print("="*80)
print(df.describe())
# Specific statistics for food preparation time
print("\n" + "="*80)
print("Food Preparation Time Statistics:")
print("="*80)
print(f"Minimum preparation time: {df['food_preparation_time'].min()} minutes")
print(f"Average preparation time: {df['food_preparation_time'].mean():.2f} minutes")
print(f"Maximum preparation time: {df['food_preparation_time'].max()} minutes")
print(f"Median preparation time: {df['food_preparation_time'].median()} minutes")
print(f"Standard deviation: {df['food_preparation_time'].std():.2f} minutes")
Statistical Summary of the Dataset:
================================================================================
order_id customer_id cost_of_the_order rating \
count 1.898000e+03 1898.000000 1898.000000 1162.000000
mean 1.477496e+06 171168.478398 16.498851 4.344234
std 5.480497e+02 113698.139743 7.483812 0.741478
min 1.476547e+06 1311.000000 4.470000 3.000000
25% 1.477021e+06 77787.750000 12.080000 4.000000
50% 1.477496e+06 128600.000000 14.140000 5.000000
75% 1.477970e+06 270525.000000 22.297500 5.000000
max 1.478444e+06 405334.000000 35.410000 5.000000
food_preparation_time delivery_time
count 1898.000000 1898.000000
mean 27.371970 24.161749
std 4.632481 4.972637
min 20.000000 15.000000
25% 23.000000 20.000000
50% 27.000000 25.000000
75% 31.000000 28.000000
max 35.000000 33.000000
================================================================================
Food Preparation Time Statistics:
================================================================================
Minimum preparation time: 20 minutes
Average preparation time: 27.37 minutes
Maximum preparation time: 35 minutes
Median preparation time: 27.0 minutes
Standard deviation: 4.63 minutes
Observations:¶
Food Preparation Time:
- Minimum: 20 minutes
- Average: ~27.37 minutes
- Maximum: 35 minutes
- The preparation time is relatively consistent across orders with a standard deviation of ~4.63 minutes
Other key statistics:
- Average order cost: ~$16.50
- Average delivery time: ~24.16 minutes
- Average total time (prep + delivery): ~51.53 minutes
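The ~51.53-minute figure is the sum of the two column means; computed per order, a total-time column also lets us flag slow orders. A sketch with toy values (the notebook itself would use df):

```python
import pandas as pd

# Toy values standing in for the real columns.
toy = pd.DataFrame({
    "food_preparation_time": [25, 35, 20],
    "delivery_time": [20, 30, 15],
})
# Total fulfillment time per order, then aggregate.
toy["total_time"] = toy["food_preparation_time"] + toy["delivery_time"]
print(toy["total_time"].mean())         # mean of per-order totals
print((toy["total_time"] > 60).mean())  # share of orders taking over an hour
```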
# Outlier check using IQR (Interquartile Range)
numeric_cols = ['cost_of_the_order', 'food_preparation_time', 'delivery_time']
outlier_summary = []
for col in numeric_cols:
q1 = df[col].quantile(0.25)
q3 = df[col].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
outliers = df[(df[col] < lower) | (df[col] > upper)]
outlier_summary.append({
"column": col,
"Q1": q1,
"Q3": q3,
"IQR": iqr,
"lower_bound": lower,
"upper_bound": upper,
"outlier_count": len(outliers),
"outlier_pct": round(len(outliers) / len(df) * 100, 2),
"min": df[col].min(),
"max": df[col].max()
})
pd.DataFrame(outlier_summary)
| | column | Q1 | Q3 | IQR | lower_bound | upper_bound | outlier_count | outlier_pct | min | max |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cost_of_the_order | 12.08 | 22.2975 | 10.2175 | -3.24625 | 37.62375 | 0 | 0.0 | 4.47 | 35.41 |
| 1 | food_preparation_time | 23.00 | 31.0000 | 8.0000 | 11.00000 | 43.00000 | 0 | 0.0 | 20.00 | 35.00 |
| 2 | delivery_time | 20.00 | 28.0000 | 8.0000 | 8.00000 | 40.00000 | 0 | 0.0 | 15.00 | 33.00 |
Clean ratings into numeric¶
The raw CSV stores ratings as text with "Not given" for missing values; this was already converted to numeric NaN in the Data Cleaning step above.
We keep a numeric rating_num column as a convenience alias for deeper analysis (correlations, group comparisons).
# rating is already numeric after cleaning, so this is a safe no-op conversion
df['rating_num'] = pd.to_numeric(df['rating'].replace('Not given', np.nan))
rating_rate = df['rating_num'].notna().mean() * 100
avg_rating = df['rating_num'].mean()
print(f"Rating rate (non-missing): {rating_rate:.1f}%")
print(f"Average rating (rated orders only): {avg_rating:.2f}")
Rating rate (non-missing): 61.2% Average rating (rated orders only): 4.34
Confidence Intervals (Bootstrap)¶
Instead of reporting only a single mean, we estimate uncertainty with a 95% bootstrap confidence interval (CI) to quantify sampling variability.
def bootstrap_ci(series, n_boot=5000, ci=0.95, seed=42):
rng = np.random.default_rng(seed)
arr = np.asarray(series.dropna())
boots = rng.choice(arr, size=(n_boot, len(arr)), replace=True).mean(axis=1)
alpha = (1 - ci) / 2
return np.mean(arr), np.quantile(boots, alpha), np.quantile(boots, 1 - alpha)
kpis = []
for col in ['delivery_time', 'food_preparation_time', 'cost_of_the_order']:
mean, lo, hi = bootstrap_ci(df[col])
kpis.append({"metric": f"Mean {col}", "mean": round(mean, 2), "CI_low": round(lo, 2), "CI_high": round(hi, 2)})
pd.DataFrame(kpis)
| | metric | mean | CI_low | CI_high |
|---|---|---|---|---|
| 0 | Mean delivery_time | 24.16 | 23.94 | 24.38 |
| 1 | Mean food_preparation_time | 27.37 | 27.17 | 27.58 |
| 2 | Mean cost_of_the_order | 16.50 | 16.16 | 16.83 |
Question 5: How many orders are not rated? [1 mark]¶
# Count orders without ratings (already converted to NaN in data cleaning)
unrated_orders = df['rating'].isnull().sum()
total_orders = len(df)
percentage_unrated = (unrated_orders / total_orders) * 100
print(f"Number of orders not rated: {unrated_orders}")
print(f"Total orders: {total_orders}")
print(f"Percentage of unrated orders: {percentage_unrated:.2f}%")
# Additional insight
rated_orders = df['rating'].notna().sum()
print(f"\nNumber of rated orders: {rated_orders}")
print(f"Percentage of rated orders: {(rated_orders / total_orders) * 100:.2f}%")
Number of orders not rated: 736 Total orders: 1898 Percentage of unrated orders: 38.78% Number of rated orders: 1162 Percentage of rated orders: 61.22%
Observations:¶
- 736 orders are not rated (38.78% of total orders)
- 1162 orders have ratings (61.22% of total orders)
- This relatively high percentage of unrated orders suggests an opportunity to improve customer engagement and feedback collection
Exploratory Data Analysis (EDA)¶
Univariate Analysis¶
Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]¶
# 1. Distribution of Order Cost
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.histplot(data=df, x='cost_of_the_order', bins=30, kde=True,
color='skyblue', edgecolor='black')
plt.axvline(df['cost_of_the_order'].mean(), color='red', linestyle='--',
linewidth=2, label=f'Mean: ${df["cost_of_the_order"].mean():.2f}')
plt.axvline(df['cost_of_the_order'].median(), color='green', linestyle='--',
linewidth=2, alpha=0.7, label=f'Median: ${df["cost_of_the_order"].median():.2f}')
plt.xlabel('Cost of Order ($)')
plt.ylabel('Frequency')
plt.title('Distribution of Order Cost')
plt.legend()
plt.grid(alpha=0.3)
# Chart choice: Histogram shows distribution shape and skewness. KDE reveals smooth density curve.
plt.subplot(1, 3, 2)
sns.boxplot(y=df['cost_of_the_order'], color='lightblue')
plt.ylabel('Cost of Order ($)')
plt.title('Boxplot of Order Cost')
plt.grid(axis='y', alpha=0.3)
# Chart choice: Boxplot effectively identifies outliers and shows quartile distribution
plt.subplot(1, 3, 3)
plt.text(0.1, 0.5, f"Statistics:\n\n"
f"Mean: ${df['cost_of_the_order'].mean():.2f}\n"
f"Median: ${df['cost_of_the_order'].median():.2f}\n"
f"Std Dev: ${df['cost_of_the_order'].std():.2f}\n"
f"Min: ${df['cost_of_the_order'].min():.2f}\n"
f"Max: ${df['cost_of_the_order'].max():.2f}\n"
f"Q1: ${df['cost_of_the_order'].quantile(0.25):.2f}\n"
f"Q3: ${df['cost_of_the_order'].quantile(0.75):.2f}",
fontsize=12, verticalalignment='center')
plt.axis('off')
plt.title('Order Cost Statistics')
plt.tight_layout()
plt.show()
# Calculate concentration range
q1 = df['cost_of_the_order'].quantile(0.25)
q3 = df['cost_of_the_order'].quantile(0.75)
print("Order Cost Observations:")
print(f"- Right-skewed distribution (mean ${df['cost_of_the_order'].mean():.2f} > median ${df['cost_of_the_order'].median():.2f})")
print(f"- Most orders concentrated in the $10-$18 range, with peak around $12-$14")
print(f"- Middle 50% (IQR) fall between ${q1:.2f}-${q3:.2f}")
print(f"- Long tail extends to ${df['cost_of_the_order'].max():.2f}, though the IQR check flags no outliers")
Order Cost Observations: - Right-skewed distribution (mean $16.50 > median $14.14) - Most orders concentrated in the $10-$18 range, with peak around $12-$14 - Middle 50% (IQR) fall between $12.08-$22.30 - Long tail extends to $35.41, though the IQR check flags no outliers
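The right-skew claim can also be checked numerically: pandas' `skew()` returns the sample skewness, which is positive for a right-skewed distribution. A sketch with a toy right-skewed sample (the notebook would call `df['cost_of_the_order'].skew()`):

```python
import pandas as pd

# Toy sample with a long right tail, mimicking the order-cost shape.
costs = pd.Series([8, 10, 12, 12, 14, 14, 16, 25, 30, 35])

print(costs.skew())                   # positive value => right (positive) skew
print(costs.mean() > costs.median())  # mean pulled above the median by the tail
```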
# 2. Distribution of Food Preparation Time
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.histplot(data=df, x='food_preparation_time', bins=20, kde=True,
color='lightcoral', edgecolor='black')
plt.axvline(df['food_preparation_time'].mean(), color='blue', linestyle='--',
linewidth=2, label=f'Mean: {df["food_preparation_time"].mean():.2f} min')
plt.axvline(df['food_preparation_time'].median(), color='green', linestyle='--',
linewidth=2, alpha=0.7, label=f'Median: {df["food_preparation_time"].median():.2f} min')
plt.xlabel('Food Preparation Time (minutes)')
plt.ylabel('Frequency')
plt.title('Distribution of Food Preparation Time')
plt.legend()
plt.grid(alpha=0.3)
# Chart choice: Histogram reveals the distribution pattern. KDE shows the smooth density curve.
plt.subplot(1, 3, 2)
sns.boxplot(y=df['food_preparation_time'], color='lightcoral')
plt.ylabel('Food Preparation Time (minutes)')
plt.title('Boxplot of Food Preparation Time')
plt.grid(axis='y', alpha=0.3)
# Chart choice: Boxplot shows median, quartiles, and any outlier preparation times
plt.subplot(1, 3, 3)
df['food_preparation_time'].value_counts().sort_index().plot(kind='bar', color='lightcoral', edgecolor='black')
plt.xlabel('Preparation Time (minutes)')
plt.ylabel('Count')
plt.title('Count of Orders by Preparation Time')
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
# Chart choice: Bar chart shows exact frequencies for each discrete time value
plt.tight_layout()
plt.show()
print("Food Preparation Time Observations:")
print(f"- Most common preparation time: {df['food_preparation_time'].mode()[0]} minutes")
print(f"- Relatively uniform distribution across 20-35 minute range")
print(f"- Mean: {df['food_preparation_time'].mean():.2f} min, Median: {df['food_preparation_time'].median():.2f} min")
Food Preparation Time Observations: - Most common preparation time: 21 minutes - Relatively uniform distribution across 20-35 minute range - Mean: 27.37 min, Median: 27.00 min
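The "relatively uniform" claim can be tested formally with a chi-square goodness-of-fit test against a uniform distribution (scipy is already imported in this notebook). A sketch with toy per-value counts; the notebook would pass `df['food_preparation_time'].value_counts().sort_index().values`:

```python
import numpy as np
from scipy import stats

# Toy counts per discrete prep-time value (roughly equal, as in the histogram).
counts = np.array([120, 115, 125, 118, 122, 119, 121, 117, 123, 120])

# Chi-square goodness-of-fit; by default the expected counts are uniform.
# A large p-value means we cannot reject uniformity.
chi2, p = stats.chisquare(counts)
print(f"chi2={chi2:.2f}, p={p:.3f}")
```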
# 3. Distribution of Delivery Time
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.histplot(data=df, x='delivery_time', bins=25, kde=True,
color='lightgreen', edgecolor='black')
plt.axvline(df['delivery_time'].mean(), color='red', linestyle='--',
linewidth=2, label=f'Mean: {df["delivery_time"].mean():.2f} min')
plt.axvline(df['delivery_time'].median(), color='blue', linestyle='--',
linewidth=2, alpha=0.7, label=f'Median: {df["delivery_time"].median():.2f} min')
plt.xlabel('Delivery Time (minutes)')
plt.ylabel('Frequency')
plt.title('Distribution of Delivery Time')
plt.legend()
plt.grid(alpha=0.3)
# Chart choice: Histogram shows distribution pattern. KDE reveals smooth density curve.
plt.subplot(1, 3, 2)
sns.boxplot(y=df['delivery_time'], color='lightgreen')
plt.ylabel('Delivery Time (minutes)')
plt.title('Boxplot of Delivery Time')
plt.grid(axis='y', alpha=0.3)
# Chart choice: Boxplot identifies outliers and shows the spread of delivery times
plt.subplot(1, 3, 3)
df['delivery_time'].value_counts().sort_index().plot(kind='bar', color='lightgreen', edgecolor='black')
plt.xlabel('Delivery Time (minutes)')
plt.ylabel('Count')
plt.title('Count of Orders by Delivery Time')
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)
# Chart choice: Bar chart shows frequency of each specific delivery time
plt.tight_layout()
plt.show()
print("Delivery Time Observations:")
print(f"- Range: {df['delivery_time'].min()}-{df['delivery_time'].max()} minutes")
print(f"- Mean: {df['delivery_time'].mean():.2f} min, Median: {df['delivery_time'].median():.2f} min")
print(f"- Most deliveries completed within 20-28 minutes")
print(f"- Distribution appears relatively symmetric (mean ≈ median)")
Delivery Time Observations: - Range: 15-33 minutes - Mean: 24.16 min, Median: 25.00 min - Most deliveries completed within 20-28 minutes - Distribution appears relatively symmetric (mean ≈ median)
# 4. Distribution of Ratings
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Only consider rated orders (NaN already handled in data cleaning)
rated_df = df[df['rating'].notna()]
# Count plot with mean and median
sns.countplot(data=rated_df, x='rating', color='orange', edgecolor='black',
order=sorted(rated_df['rating'].unique()), ax=axes[0])
min_rating = rated_df['rating'].min()
mean_pos = rated_df['rating'].mean() - min_rating
median_pos = rated_df['rating'].median() - min_rating
axes[0].axvline(mean_pos, color='red', linestyle='--',
linewidth=2, label=f'Mean: {rated_df["rating"].mean():.2f}')
axes[0].axvline(median_pos, color='blue', linestyle='--',
linewidth=2, alpha=0.7, label=f'Median: {rated_df["rating"].median():.2f}')
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of Customer Ratings')
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)
# Chart choice: Bar chart is ideal for discrete ordinal data (ratings 1-5)
# Percentage plot
total = len(rated_df)
rating_counts = rated_df['rating'].value_counts().sort_index()
axes[1].bar(rating_counts.index, (rating_counts / total) * 100,
color='orange', edgecolor='black')
axes[1].set_xlabel('Rating')
axes[1].set_ylabel('Percentage (%)')
axes[1].set_title('Percentage Distribution of Ratings')
axes[1].set_xticks(sorted(rated_df['rating'].unique()))
axes[1].grid(axis='y', alpha=0.3)
# Chart choice: Percentage bar chart allows easy comparison of rating proportions
plt.tight_layout()
plt.show()
print("Rating Distribution Observations:")
print(f"- Mean rating: {rated_df['rating'].mean():.2f}, Median: {rated_df['rating'].median():.2f}")
print(f"- NO ratings below 3 stars - all rated orders received 3, 4, or 5 stars only")
print(f"- Rating of {rated_df['rating'].mode()[0]} is most frequent")
print(f"- Distribution is left-skewed (mean < median: {rated_df['rating'].mean():.2f} < {rated_df['rating'].median():.2f})")
print(f"- {(rated_df['rating'] >= 4).sum() / len(rated_df) * 100:.1f}% of rated orders gave 4 or 5 stars")
print(f"- However, 38.8% of orders are unrated (potential negative bias)")
Rating Distribution Observations: - Mean rating: 4.34, Median: 5.00 - NO ratings below 3 stars - all rated orders received 3, 4, or 5 stars only - Rating of 5.0 is most frequent - Distribution is left-skewed (mean < median: 4.34 < 5.00) - 83.8% of rated orders gave 4 or 5 stars - However, 38.8% of orders are unrated (potential negative bias)
# 5. Distribution of Day of the Week
plt.figure(figsize=(10, 6))
day_counts = df['day_of_the_week'].value_counts()
plt.subplot(1, 2, 1)
sns.countplot(data=df, x='day_of_the_week', hue='day_of_the_week',
palette={'Weekday': 'coral', 'Weekend': 'steelblue'},
edgecolor='black', order=['Weekday', 'Weekend'], legend=False)
plt.xlabel('Day Type')
plt.ylabel('Number of Orders')
plt.title('Orders by Day Type')
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
# Chart choice: Bar chart clearly compares two categories (weekday vs weekend)
plt.subplot(1, 2, 2)
plt.pie(day_counts.values, labels=day_counts.index, autopct='%1.1f%%',
colors=['steelblue', 'coral'], startangle=90)
plt.title('Percentage of Orders by Day Type')
# Chart choice: Pie chart effectively shows proportions for a binary categorical variable
plt.tight_layout()
plt.show()
print("Day of Week Statistics:")
print(day_counts)
print(f"\nWeekend orders: {day_counts.get('Weekend', 0)} ({day_counts.get('Weekend', 0)/len(df)*100:.1f}%)")
print(f"Weekday orders: {day_counts.get('Weekday', 0)} ({day_counts.get('Weekday', 0)/len(df)*100:.1f}%)")
Day of Week Statistics: day_of_the_week Weekend 1351 Weekday 547 Name: count, dtype: int64 Weekend orders: 1351 (71.2%) Weekday orders: 547 (28.8%)
# 6. Distribution of Cuisine Types
plt.figure(figsize=(12, 6))
cuisine_counts = df['cuisine_type'].value_counts()
plt.subplot(1, 2, 1)
# Get top 10 cuisines for filtering
top_10_cuisines = cuisine_counts.head(10).index
top_10_df = df[df['cuisine_type'].isin(top_10_cuisines)]
# Use seaborn countplot with proper ordering
sns.countplot(data=top_10_df, y='cuisine_type',
order=top_10_cuisines, color='teal', edgecolor='black')
plt.xlabel('Number of Orders')
plt.ylabel('Cuisine Type')
plt.title('Top 10 Cuisines by Number of Orders')
plt.grid(axis='x', alpha=0.3)
# Chart choice: Horizontal bar chart is better for long category names (easier to read)
plt.subplot(1, 2, 2)
top_5_cuisines = cuisine_counts.head(5)
other_count = cuisine_counts[5:].sum()
pie_data = pd.concat([top_5_cuisines, pd.Series({'Others': other_count})])
plt.pie(pie_data.values, labels=pie_data.index, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Top 5 Cuisines vs Others')
# Chart choice: Pie chart shows market share of top cuisines vs rest
plt.tight_layout()
plt.show()
print("Top 10 Cuisine Types:")
print(cuisine_counts.head(10))
Top 10 Cuisine Types: cuisine_type American 584 Japanese 470 Italian 298 Chinese 215 Mexican 77 Indian 73 Middle Eastern 49 Mediterranean 46 Thai 19 French 18 Name: count, dtype: int64
# 7. Distribution of Restaurant Orders
plt.figure(figsize=(14, 6))
restaurant_counts = df['restaurant_name'].value_counts()
# Get top 15 restaurants for filtering
top_15_restaurants = restaurant_counts.head(15).index
top_15_df = df[df['restaurant_name'].isin(top_15_restaurants)]
# Use seaborn countplot with proper ordering
sns.countplot(data=top_15_df, y='restaurant_name',
order=top_15_restaurants, color='purple', edgecolor='black')
plt.xlabel('Number of Orders')
plt.ylabel('Restaurant Name')
plt.title('Top 15 Restaurants by Number of Orders')
plt.grid(axis='x', alpha=0.3)
# Chart choice: Horizontal bar chart handles many long restaurant names better than vertical
plt.tight_layout()
plt.show()
print(f"Total unique restaurants: {df['restaurant_name'].nunique()}")
print(f"\nTop 5 Restaurants:")
print(restaurant_counts.head())
Total unique restaurants: 178 Top 5 Restaurants: restaurant_name Shake Shack 219 The Meatball Shop 132 Blue Ribbon Sushi 119 Blue Ribbon Fried Chicken 96 Parm 68 Name: count, dtype: int64
# Summary of Univariate Analysis
print("="*80)
print("SUMMARY OF UNIVARIATE ANALYSIS")
print("="*80)
print("\n1. ORDER COST:")
print(" - Right-skewed distribution with mean ~$16.50")
print(" - Most orders fall in the $5-$25 range")
print(" - Long right tail up to ~$35.41 (no IQR outliers flagged)")
print("\n2. FOOD PREPARATION TIME:")
print(" - Ranges from 20 to 35 minutes")
print(" - Relatively uniform distribution")
print(" - Average preparation time is ~27 minutes")
print("\n3. DELIVERY TIME:")
print(" - Ranges from 15 to 33 minutes")
print(" - Average delivery time is ~24 minutes")
print(" - Most deliveries completed within 20-28 minutes")
print("\n4. RATINGS:")
print(" - 38.8% of orders are not rated")
print(" - Among rated orders, ratings of 5 and 4 are most common")
print(" - Average rating is ~4.34 (rated orders only)")
print(" - Rating of 5 is the most frequent")
print("\n5. DAY OF THE WEEK:")
print(" - Weekend orders are more frequent than weekday orders")
print(f" - Weekend: ~{day_counts.get('Weekend', 0)/len(df)*100:.1f}%")
print(f" - Weekday: ~{day_counts.get('Weekday', 0)/len(df)*100:.1f}%")
print("\n6. CUISINE TYPES:")
print(" - American cuisine is the most popular")
print(" - Japanese, Italian, and Chinese are also highly ordered")
print(f" - Total of {df['cuisine_type'].nunique()} different cuisine types")
print("\n7. RESTAURANTS:")
print(f" - {df['restaurant_name'].nunique()} unique restaurants in the dataset")
print(" - Some restaurants receive significantly more orders than others")
print(" - Top restaurants appear to be popular chains (Shake Shack, etc.)")
================================================================================ SUMMARY OF UNIVARIATE ANALYSIS ================================================================================ 1. ORDER COST: - Right-skewed distribution with mean ~$16.50 - Most orders fall in the $5-$25 range - Long right tail up to ~$35.41 (no IQR outliers flagged) 2. FOOD PREPARATION TIME: - Ranges from 20 to 35 minutes - Relatively uniform distribution - Average preparation time is ~27 minutes 3. DELIVERY TIME: - Ranges from 15 to 33 minutes - Average delivery time is ~24 minutes - Most deliveries completed within 20-28 minutes 4. RATINGS: - 38.8% of orders are not rated - Among rated orders, ratings of 5 and 4 are most common - Average rating is ~4.34 (rated orders only) - Rating of 5 is the most frequent 5. DAY OF THE WEEK: - Weekend orders are more frequent than weekday orders - Weekend: ~71.2% - Weekday: ~28.8% 6. CUISINE TYPES: - American cuisine is the most popular - Japanese, Italian, and Chinese are also highly ordered - Total of 14 different cuisine types 7. RESTAURANTS: - 178 unique restaurants in the dataset - Some restaurants receive significantly more orders than others - Top restaurants appear to be popular chains (Shake Shack, etc.)
Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]¶
# Find top 5 restaurants by number of orders
top_5_restaurants = df['restaurant_name'].value_counts().head(5)
print("Top 5 Restaurants by Number of Orders:")
print("="*50)
for i, (restaurant, count) in enumerate(top_5_restaurants.items(), 1):
print(f"{i}. {restaurant}: {count} orders")
# Visualize
plt.figure(figsize=(10, 6))
top_5_restaurants.plot(kind='barh', color='steelblue', edgecolor='black')
plt.xlabel('Number of Orders')
plt.ylabel('Restaurant Name')
plt.title('Top 5 Restaurants by Number of Orders')
plt.gca().invert_yaxis()
# Chart choice: Horizontal bar chart makes restaurant names easier to read and compare values
plt.tight_layout()
plt.show()
Top 5 Restaurants by Number of Orders: ================================================== 1. Shake Shack: 219 orders 2. The Meatball Shop: 132 orders 3. Blue Ribbon Sushi: 119 orders 4. Blue Ribbon Fried Chicken: 96 orders 5. Parm: 68 orders
Observations:¶
The top 5 restaurants demonstrate clear market leaders in terms of order volume. Shake Shack dominates with the highest number of orders, followed by The Meatball Shop, Blue Ribbon Sushi, and other popular establishments. These restaurants likely have strong brand recognition, convenient locations, or high-quality food that drives repeat customers.
Question 8: Which is the most popular cuisine on weekends? [1 mark]¶
# Filter weekend orders
weekend_orders = df[df['day_of_the_week'] == 'Weekend']
# Find most popular cuisine on weekends
weekend_cuisine = weekend_orders['cuisine_type'].value_counts()
print("Most Popular Cuisines on Weekends:")
print("="*50)
print(weekend_cuisine.head(10))
print(f"\nThe most popular cuisine on weekends is: {weekend_cuisine.index[0]}")
print(f"Number of orders: {weekend_cuisine.iloc[0]}")
print(f"Percentage of weekend orders: {weekend_cuisine.iloc[0]/len(weekend_orders)*100:.2f}%")
# Visualize top 5 weekend cuisines
plt.figure(figsize=(10, 6))
weekend_cuisine.head(5).plot(kind='bar', color='coral', edgecolor='black')
plt.xlabel('Cuisine Type')
plt.ylabel('Number of Orders')
plt.title('Top 5 Cuisines on Weekends')
plt.xticks(rotation=45, ha='right')
# Chart choice: Bar chart effectively compares cuisine popularity rankings
plt.tight_layout()
plt.show()
Most Popular Cuisines on Weekends:
==================================================
cuisine_type
American          415
Japanese          335
Italian           207
Chinese           163
Mexican            53
Indian             49
Middle Eastern     32
Mediterranean      32
Thai               15
French             13
Name: count, dtype: int64

The most popular cuisine on weekends is: American
Number of orders: 415
Percentage of weekend orders: 30.72%
Observations:¶
American cuisine is the most popular on weekends, reflecting its broad appeal and accessibility. Japanese and Italian cuisines also show strong weekend demand, suggesting customers prefer diverse and potentially more indulgent dining options during their leisure time.
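As a cross-check, cuisine shares can be read directly from `value_counts(normalize=True)` instead of dividing by the row count by hand. A minimal sketch with made-up data (in the notebook this would be `weekend_orders['cuisine_type']`):

```python
import pandas as pd

# Hypothetical weekend orders standing in for weekend_orders['cuisine_type']
cuisines = pd.Series(["American", "American", "Japanese", "American", "Italian"])

# normalize=True returns proportions instead of raw counts
shares = cuisines.value_counts(normalize=True)
print(shares)
# American leads with a share of 0.6 (3 of 5 orders)
```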
Question 9: What percentage of the orders cost more than 20 dollars? [2 marks]¶
# Calculate orders costing more than $20
orders_above_20 = df[df['cost_of_the_order'] > 20]
count_above_20 = len(orders_above_20)
percentage_above_20 = (count_above_20 / len(df)) * 100
print(f"Orders costing more than $20: {count_above_20}")
print(f"Total orders: {len(df)}")
print(f"Percentage: {percentage_above_20:.2f}%")
# Additional breakdown
orders_0_5 = len(df[df['cost_of_the_order'] <= 5])
orders_5_20 = len(df[(df['cost_of_the_order'] > 5) & (df['cost_of_the_order'] <= 20)])
orders_above_20 = count_above_20  # same count computed above; avoid re-filtering
print("\nCost Distribution:")
print(f"Orders ≤ $5: {orders_0_5} ({orders_0_5/len(df)*100:.2f}%)")
print(f"Orders $5-$20: {orders_5_20} ({orders_5_20/len(df)*100:.2f}%)")
print(f"Orders > $20: {orders_above_20} ({orders_above_20/len(df)*100:.2f}%)")
# Visualize
plt.figure(figsize=(10, 6))
categories = ['≤ $5', '$5-$20', '> $20']
counts = [orders_0_5, orders_5_20, orders_above_20]
colors = ['lightblue', 'lightgreen', 'lightcoral']
plt.bar(categories, counts, color=colors, edgecolor='black')
plt.xlabel('Order Cost Range')
plt.ylabel('Number of Orders')
plt.title('Distribution of Orders by Cost Range')
for i, v in enumerate(counts):
plt.text(i, v + 20, f"{v}\n({v/len(df)*100:.1f}%)", ha='center', va='bottom')
# Chart choice: Bar chart with value labels clearly shows the distribution across three price tiers
plt.tight_layout()
plt.show()
Orders costing more than $20: 555
Total orders: 1898
Percentage: 29.24%

Cost Distribution:
Orders ≤ $5: 9 (0.47%)
Orders $5-$20: 1334 (70.28%)
Orders > $20: 555 (29.24%)
Observations:¶
About 29% of orders cost more than \$20, indicating that a sizable minority of customers are willing to spend on higher-value orders. The majority (roughly 70%) fall in the \$5-\$20 range, suggesting this is the sweet spot for most customers.
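The filter-then-count pattern above works, but the percentage can also be obtained in one step, since the mean of a boolean mask equals the fraction of `True` values. A sketch with made-up costs (the notebook would use `df['cost_of_the_order']`):

```python
import pandas as pd

# Made-up order costs standing in for df['cost_of_the_order']
costs = pd.Series([4.5, 12.0, 22.5, 19.9, 25.0])

# The mean of a boolean Series is the fraction of True values
pct_above_20 = (costs > 20).mean() * 100
print(f"{pct_above_20:.2f}% of orders cost more than $20")  # 40.00%
```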
Question 10: What is the mean order delivery time? [1 mark]¶
# Calculate mean delivery time
mean_delivery = df['delivery_time'].mean()
median_delivery = df['delivery_time'].median()
std_delivery = df['delivery_time'].std()
print(f"Mean delivery time: {mean_delivery:.2f} minutes")
print(f"Median delivery time: {median_delivery:.2f} minutes")
print(f"Standard deviation: {std_delivery:.2f} minutes")
print(f"Range: {df['delivery_time'].min()} - {df['delivery_time'].max()} minutes")
# Calculate total time (preparation + delivery)
df['total_time'] = df['food_preparation_time'] + df['delivery_time']
mean_total = df['total_time'].mean()
print(f"\nMean total time (preparation + delivery): {mean_total:.2f} minutes")
Mean delivery time: 24.16 minutes
Median delivery time: 25.00 minutes
Standard deviation: 4.97 minutes
Range: 15 - 33 minutes

Mean total time (preparation + delivery): 51.53 minutes
Observations:¶
The mean delivery time is approximately 24.16 minutes, which is quite efficient for a food delivery service. Combined with the average preparation time of ~27 minutes, customers can expect their orders within roughly 51 minutes from placement to delivery.
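Since the mean alone hides the slow tail that frustrates customers, a percentile gives a better service-level view (e.g. "90% of deliveries arrive within X minutes"). A sketch on hypothetical delivery times, not the notebook's data:

```python
import numpy as np

# Hypothetical delivery times in minutes (df['delivery_time'] in the notebook)
delivery = np.array([15, 20, 22, 25, 25, 28, 30, 33])

# 90th percentile: the time within which ~90% of orders arrive
p90 = np.percentile(delivery, 90)
print(f"90% of deliveries complete within {p90:.1f} minutes")
```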
Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]¶
# Find top 3 customers by order frequency
top_3_customers = df['customer_id'].value_counts().head(3)
print("Top 3 Most Frequent Customers:")
print("="*50)
for i, (customer_id, order_count) in enumerate(top_3_customers.items(), 1):
print(f"{i}. Customer ID: {customer_id} - {order_count} orders")
# Additional insights
total_orders_top3 = top_3_customers.sum()
print(f"\nTotal orders from top 3 customers: {total_orders_top3}")
print(f"Percentage of all orders: {total_orders_top3/len(df)*100:.2f}%")
# Visualize
plt.figure(figsize=(10, 6))
top_3_customers.plot(kind='bar', color='gold', edgecolor='black')
plt.xlabel('Customer ID')
plt.ylabel('Number of Orders')
plt.title('Top 3 Most Frequent Customers')
plt.xticks(rotation=0)
# Chart choice: Bar chart clearly highlights the top customers and their order frequency
plt.tight_layout()
plt.show()
Top 3 Most Frequent Customers:
==================================================
1. Customer ID: 52832 - 13 orders
2. Customer ID: 47440 - 10 orders
3. Customer ID: 83287 - 9 orders

Total orders from top 3 customers: 32
Percentage of all orders: 1.69%
Observations:¶
The top 3 customers show significantly higher order frequencies than average customers, making them ideal candidates for loyalty rewards. These power users represent a valuable segment that contributes substantially to revenue and should be retained through targeted promotions.
Multivariate Analysis¶
Pairplot for Numeric Variables¶
Before diving into specific multivariate relationships, let's visualize all numeric variable relationships simultaneously using a pairplot. This provides a comprehensive overview of pairwise correlations and distributions.
# Pairplot + Correlation Analysis for Numeric Variables
import seaborn as sns
import matplotlib.pyplot as plt
# ------------------------------------------------------------
# 1. Select numeric columns for analysis
# ------------------------------------------------------------
numeric_cols = ['cost_of_the_order', 'food_preparation_time', 'delivery_time']
# ------------------------------------------------------------
# 2. Create pairplot
# ------------------------------------------------------------
pairplot_grid = sns.pairplot(
df[numeric_cols],
diag_kind="kde", # KDE curves on diagonal
plot_kws={
"alpha": 0.6,
"s": 30
},
diag_kws={
"linewidth": 2
}
)
# Customize figure size and title
pairplot_grid.fig.set_size_inches(12, 10)
pairplot_grid.fig.suptitle(
"Pairplot of Numeric Variables",
fontsize=16,
fontweight="bold",
y=1.02
)
plt.show()
# ------------------------------------------------------------
# 3. Correlation matrix
# ------------------------------------------------------------
correlation_matrix = df[numeric_cols].corr()
print("\nCorrelation Matrix")
print("=" * 60)
print(correlation_matrix)
# ------------------------------------------------------------
# 4. Key correlation values
# ------------------------------------------------------------
print("\nKey Observations")
print("=" * 60)
print(f"1. Cost vs Prep Time: {correlation_matrix.loc['cost_of_the_order', 'food_preparation_time']:.3f}")
print(f"2. Cost vs Delivery Time: {correlation_matrix.loc['cost_of_the_order', 'delivery_time']:.3f}")
print(f"3. Prep vs Delivery Time: {correlation_matrix.loc['food_preparation_time', 'delivery_time']:.3f}")
# ------------------------------------------------------------
# 5. Interpretation
# ------------------------------------------------------------
print("\nInsights:")
print("- Correlations are weak, suggesting limited linear relationships.")
print("- Cost distribution appears right-skewed (seen in KDE diagonal plots).")
print("- Preparation and delivery times are more symmetric.")
print("- Each operational metric may be optimized independently.")
Correlation Matrix
============================================================
cost_of_the_order food_preparation_time delivery_time
cost_of_the_order 1.000000 0.041527 -0.029949
food_preparation_time 0.041527 1.000000 0.011094
delivery_time -0.029949 0.011094 1.000000
Key Observations
============================================================
1. Cost vs Prep Time: 0.042
2. Cost vs Delivery Time: -0.030
3. Prep vs Delivery Time: 0.011
Insights:
- Correlations are weak, suggesting limited linear relationships.
- Cost distribution appears right-skewed (seen in KDE diagonal plots).
- Preparation and delivery times are more symmetric.
- Each operational metric may be optimized independently.
Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]¶
# 1. Correlation Matrix for Numerical Variables
numerical_cols = ['cost_of_the_order', 'food_preparation_time', 'delivery_time', 'total_time']
correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Numerical Variables')
# Chart choice: Heatmap effectively visualizes correlation strength and direction between multiple variables
plt.tight_layout()
plt.show()
print("Correlation Insights:")
print(correlation_matrix)
Correlation Insights:
                       cost_of_the_order  food_preparation_time  delivery_time  total_time
cost_of_the_order               1.000000               0.041527      -0.029949    0.006358
food_preparation_time           0.041527               1.000000       0.011094    0.685970
delivery_time                  -0.029949               0.011094       1.000000    0.735195
total_time                      0.006358               0.685970       0.735195    1.000000
# 2. Cost vs Day of Week
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
df.boxplot(column='cost_of_the_order', by='day_of_the_week', ax=plt.gca())
plt.title('Order Cost Distribution by Day Type')
plt.suptitle('')
plt.xlabel('Day of the Week')
plt.ylabel('Cost of Order ($)')
# Chart choice: Boxplot compares distributions across categories and reveals outliers
plt.subplot(1, 2, 2)
weekday_mean = df[df['day_of_the_week'] == 'Weekday']['cost_of_the_order'].mean()
weekend_mean = df[df['day_of_the_week'] == 'Weekend']['cost_of_the_order'].mean()
plt.bar(['Weekday', 'Weekend'], [weekday_mean, weekend_mean], color=['steelblue', 'coral'], edgecolor='black')
plt.ylabel('Average Cost ($)')
plt.title('Average Order Cost by Day Type')
for i, v in enumerate([weekday_mean, weekend_mean]):
plt.text(i, v + 0.3, f'${v:.2f}', ha='center', va='bottom')
# Chart choice: Bar chart with value labels makes mean comparison straightforward
plt.tight_layout()
plt.show()
print(f"Average cost on Weekdays: ${weekday_mean:.2f}")
print(f"Average cost on Weekends: ${weekend_mean:.2f}")
print(f"Difference: ${abs(weekend_mean - weekday_mean):.2f}")
Average cost on Weekdays: $16.31
Average cost on Weekends: $16.57
Difference: $0.26
# 3. Delivery Time vs Day of Week
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
df.boxplot(column='delivery_time', by='day_of_the_week', ax=plt.gca())
plt.title('Delivery Time Distribution by Day Type')
plt.suptitle('')
plt.xlabel('Day of the Week')
plt.ylabel('Delivery Time (minutes)')
# Chart choice: Boxplot shows median, quartiles, and outliers for both groups
plt.subplot(1, 2, 2)
weekday_delivery = df[df['day_of_the_week'] == 'Weekday']['delivery_time'].mean()
weekend_delivery = df[df['day_of_the_week'] == 'Weekend']['delivery_time'].mean()
plt.bar(['Weekday', 'Weekend'], [weekday_delivery, weekend_delivery], color=['steelblue', 'coral'], edgecolor='black')
plt.ylabel('Average Delivery Time (minutes)')
plt.title('Average Delivery Time by Day Type')
for i, v in enumerate([weekday_delivery, weekend_delivery]):
plt.text(i, v + 0.3, f'{v:.2f} min', ha='center', va='bottom')
# Chart choice: Bar chart highlights the difference (or similarity) in average times
plt.tight_layout()
plt.show()
print(f"Average delivery time on Weekdays: {weekday_delivery:.2f} minutes")
print(f"Average delivery time on Weekends: {weekend_delivery:.2f} minutes")
Average delivery time on Weekdays: 28.34 minutes
Average delivery time on Weekends: 22.47 minutes
# 4. Cost by Cuisine Type (Top 5)
top_cuisines = df['cuisine_type'].value_counts().head(5).index
top_cuisine_data = df[df['cuisine_type'].isin(top_cuisines)]
plt.figure(figsize=(12, 6))
sns.boxplot(data=top_cuisine_data, x='cuisine_type', y='cost_of_the_order')
plt.xlabel('Cuisine Type')
plt.ylabel('Cost of Order ($)')
plt.title('Order Cost Distribution by Top 5 Cuisine Types')
plt.xticks(rotation=45, ha='right')
# Chart choice: Boxplot compares cost distributions across multiple cuisine categories simultaneously
plt.tight_layout()
plt.show()
# Calculate mean cost by cuisine
cuisine_cost = df.groupby('cuisine_type')['cost_of_the_order'].mean().sort_values(ascending=False)
print("\nAverage Cost by Cuisine Type (Top 10):")
print(cuisine_cost.head(10))
Average Cost by Cuisine Type (Top 10):
cuisine_type
French            19.793889
Southern          19.300588
Thai              19.207895
Spanish           18.994167
Middle Eastern    18.820612
Mexican           16.933117
Indian            16.919726
Italian           16.418691
American          16.319829
Chinese           16.305209
Name: cost_of_the_order, dtype: float64
# Correlation between numeric variables (including cleaned ratings)
corr_cols = ['cost_of_the_order', 'food_preparation_time', 'delivery_time', 'rating_num']
corr = df[corr_cols].corr(numeric_only=True)
corr
| cost_of_the_order | food_preparation_time | delivery_time | rating_num | |
|---|---|---|---|---|
| cost_of_the_order | 1.000000 | 0.041527 | -0.029949 | 0.033983 |
| food_preparation_time | 0.041527 | 1.000000 | 0.011094 | -0.006083 |
| delivery_time | -0.029949 | 0.011094 | 1.000000 | -0.009804 |
| rating_num | 0.033983 | -0.006083 | -0.009804 | 1.000000 |
plt.figure(figsize=(6,4))
plt.imshow(corr, aspect='auto')
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha='right')
plt.yticks(range(len(corr.index)), corr.index)
plt.colorbar()
plt.title("Correlation heatmap (numeric variables)")
plt.tight_layout()
plt.show()
Weekday vs Weekend delivery time (basic statistical test)¶
Business question: is delivery time meaningfully different on weekends vs weekdays? We show both the difference in means and a Welch t-test result, interpreted cautiously.
from scipy import stats
weekend = df.loc[df['day_of_the_week']=='Weekend', 'delivery_time']
weekday = df.loc[df['day_of_the_week']=='Weekday', 'delivery_time']
mean_weekend = weekend.mean()
mean_weekday = weekday.mean()
diff = mean_weekend - mean_weekday
t_stat, p_val = stats.ttest_ind(weekend, weekday, equal_var=False, nan_policy='omit')
pd.DataFrame([{
"mean_delivery_weekend": round(mean_weekend,2),
"mean_delivery_weekday": round(mean_weekday,2),
"difference_weekend_minus_weekday": round(diff,2),
"welch_ttest_p_value": p_val
}])
| mean_delivery_weekend | mean_delivery_weekday | difference_weekend_minus_weekday | welch_ttest_p_value | |
|---|---|---|---|---|
| 0 | 22.47 | 28.34 | -5.87 | 2.573715e-184 |
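With roughly 1,900 orders, a tiny p-value mainly confirms the difference is not noise; it says nothing about whether the ~6-minute gap is practically large. An effect-size measure such as Cohen's d addresses that. A sketch on synthetic samples (the means and spread below are stand-ins, not the notebook's data):

```python
import numpy as np

rng = np.random.default_rng(0)
# Synthetic stand-ins for weekday/weekend delivery times (minutes)
weekday = rng.normal(28.3, 5.0, size=500)
weekend = rng.normal(22.5, 5.0, size=800)

def cohens_d(a, b):
    """Standardized mean difference using the pooled standard deviation."""
    na, nb = len(a), len(b)
    pooled_var = ((na - 1) * a.var(ddof=1) + (nb - 1) * b.var(ddof=1)) / (na + nb - 2)
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)

d = cohens_d(weekday, weekend)
# By common rules of thumb, |d| near 1 or above counts as a large effect
print(f"Cohen's d: {d:.2f}")
```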
KPI table by cuisine (operational + customer view)¶
This creates a compact business table with:
- volume (order count)
- revenue proxy (avg order cost)
- operational speed (prep, delivery, total time)
- customer experience (rating rate, avg rating)
df['total_time'] = df['food_preparation_time'] + df['delivery_time']
kpi_by_cuisine = (
df.groupby('cuisine_type')
.agg(
orders=('order_id','count'),
avg_cost=('cost_of_the_order','mean'),
avg_prep=('food_preparation_time','mean'),
avg_delivery=('delivery_time','mean'),
avg_total_time=('total_time','mean'),
rating_rate=('rating_num', lambda s: s.notna().mean()),
avg_rating=('rating_num','mean')
)
.sort_values('orders', ascending=False)
)
kpi_by_cuisine_display = kpi_by_cuisine.copy()
kpi_by_cuisine_display['avg_cost'] = kpi_by_cuisine_display['avg_cost'].round(2)
kpi_by_cuisine_display['avg_prep'] = kpi_by_cuisine_display['avg_prep'].round(2)
kpi_by_cuisine_display['avg_delivery'] = kpi_by_cuisine_display['avg_delivery'].round(2)
kpi_by_cuisine_display['avg_total_time'] = kpi_by_cuisine_display['avg_total_time'].round(2)
kpi_by_cuisine_display['rating_rate'] = (kpi_by_cuisine_display['rating_rate']*100).round(1)
kpi_by_cuisine_display['avg_rating'] = kpi_by_cuisine_display['avg_rating'].round(2)
kpi_by_cuisine_display.head(10)
| orders | avg_cost | avg_prep | avg_delivery | avg_total_time | rating_rate | avg_rating | |
|---|---|---|---|---|---|---|---|
| cuisine_type | |||||||
| American | 584 | 16.32 | 27.44 | 24.19 | 51.63 | 63.0 | 4.30 |
| Japanese | 470 | 16.30 | 27.51 | 24.13 | 51.64 | 58.1 | 4.37 |
| Italian | 298 | 16.42 | 27.48 | 24.57 | 52.05 | 57.7 | 4.36 |
| Chinese | 215 | 16.31 | 27.51 | 23.86 | 51.37 | 61.9 | 4.34 |
| Mexican | 77 | 16.93 | 26.73 | 24.39 | 51.12 | 62.3 | 4.42 |
| Indian | 73 | 16.92 | 27.11 | 24.08 | 51.19 | 68.5 | 4.54 |
| Middle Eastern | 49 | 18.82 | 26.67 | 24.08 | 50.76 | 69.4 | 4.24 |
| Mediterranean | 46 | 15.47 | 27.00 | 23.59 | 50.59 | 69.6 | 4.22 |
| Thai | 19 | 19.21 | 27.32 | 23.16 | 50.47 | 47.4 | 4.67 |
| French | 18 | 19.79 | 26.89 | 25.33 | 52.22 | 55.6 | 4.30 |
KPI table for top restaurants (by volume)¶
This connects business actions to specific partner restaurants.
top_restaurants = df['restaurant_name'].value_counts().head(10).index
kpi_by_restaurant = (
df[df['restaurant_name'].isin(top_restaurants)]
.groupby('restaurant_name')
.agg(
orders=('order_id','count'),
avg_cost=('cost_of_the_order','mean'),
avg_total_time=('total_time','mean'),
rating_rate=('rating_num', lambda s: s.notna().mean()),
avg_rating=('rating_num','mean')
)
.sort_values('orders', ascending=False)
)
kpi_by_restaurant_display = kpi_by_restaurant.copy()
kpi_by_restaurant_display['avg_cost'] = kpi_by_restaurant_display['avg_cost'].round(2)
kpi_by_restaurant_display['avg_total_time'] = kpi_by_restaurant_display['avg_total_time'].round(2)
kpi_by_restaurant_display['rating_rate'] = (kpi_by_restaurant_display['rating_rate']*100).round(1)
kpi_by_restaurant_display['avg_rating'] = kpi_by_restaurant_display['avg_rating'].round(2)
kpi_by_restaurant_display
| orders | avg_cost | avg_total_time | rating_rate | avg_rating | |
|---|---|---|---|---|---|
| restaurant_name | |||||
| Shake Shack | 219 | 16.34 | 52.61 | 60.7 | 4.28 |
| The Meatball Shop | 132 | 16.25 | 51.42 | 63.6 | 4.51 |
| Blue Ribbon Sushi | 119 | 16.00 | 51.87 | 61.3 | 4.22 |
| Blue Ribbon Fried Chicken | 96 | 17.32 | 51.34 | 66.7 | 4.33 |
| Parm | 68 | 16.36 | 52.81 | 57.4 | 4.13 |
| RedFarm Broadway | 59 | 16.36 | 50.75 | 69.5 | 4.24 |
| RedFarm Hudson | 55 | 16.75 | 51.36 | 61.8 | 4.18 |
| TAO | 49 | 17.03 | 49.94 | 57.1 | 4.36 |
| Han Dynasty | 46 | 16.42 | 50.57 | 50.0 | 4.43 |
| Blue Ribbon Sushi Bar & Grill | 44 | 15.15 | 49.70 | 50.0 | 4.59 |
# 5. Ratings vs Cost
rated_data = df[df['rating'].notna()].copy()
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.boxplot(data=rated_data, x='rating', y='cost_of_the_order')
plt.xlabel('Rating')
plt.ylabel('Cost of Order ($)')
plt.title('Order Cost by Rating')
# Chart choice: Boxplot shows if higher/lower cost orders receive different ratings
plt.subplot(1, 2, 2)
rating_cost = rated_data.groupby('rating')['cost_of_the_order'].mean()
rating_cost.plot(kind='bar', color='orange', edgecolor='black')
plt.xlabel('Rating')
plt.ylabel('Average Cost ($)')
plt.title('Average Order Cost by Rating')
plt.xticks(rotation=0)
# Chart choice: Bar chart shows the trend of average cost across rating levels
plt.tight_layout()
plt.show()
print("Average Cost by Rating:")
print(rating_cost)
Average Cost by Rating:
rating
3.0    16.223351
4.0    16.710337
5.0    16.965697
Name: cost_of_the_order, dtype: float64
# 6. Preparation Time vs Delivery Time
plt.figure(figsize=(10, 6))
plt.scatter(df['food_preparation_time'], df['delivery_time'], alpha=0.5, s=30)
plt.xlabel('Food Preparation Time (minutes)')
plt.ylabel('Delivery Time (minutes)')
plt.title('Relationship between Preparation Time and Delivery Time')
plt.grid(True, alpha=0.3)
# Chart choice: Scatter plot reveals any correlation or patterns between two continuous variables
plt.tight_layout()
plt.show()
correlation = df['food_preparation_time'].corr(df['delivery_time'])
print(f"Correlation between preparation time and delivery time: {correlation:.3f}")
Correlation between preparation time and delivery time: 0.011
# 7. Order Count and Average Cost by Cuisine and Day
cuisine_day_analysis = df.groupby(['cuisine_type', 'day_of_the_week']).agg({
'order_id': 'count',
'cost_of_the_order': 'mean'
}).reset_index()
cuisine_day_analysis.columns = ['cuisine_type', 'day_of_the_week', 'order_count', 'avg_cost']
# Filter top 5 cuisines
top_5_cuisine_day = cuisine_day_analysis[cuisine_day_analysis['cuisine_type'].isin(top_cuisines)]
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
pivot_count = top_5_cuisine_day.pivot(index='cuisine_type', columns='day_of_the_week', values='order_count')
pivot_count.plot(kind='bar', ax=plt.gca(), color=['steelblue', 'coral'], edgecolor='black')
plt.ylabel('Number of Orders')
plt.xlabel('Cuisine Type')
plt.title('Order Count by Cuisine and Day Type')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Day Type')
# Chart choice: Grouped bar chart compares two categorical variables simultaneously
plt.subplot(1, 2, 2)
pivot_cost = top_5_cuisine_day.pivot(index='cuisine_type', columns='day_of_the_week', values='avg_cost')
pivot_cost.plot(kind='bar', ax=plt.gca(), color=['steelblue', 'coral'], edgecolor='black')
plt.ylabel('Average Cost ($)')
plt.xlabel('Cuisine Type')
plt.title('Average Cost by Cuisine and Day Type')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Day Type')
# Chart choice: Grouped bar chart reveals if pricing varies by day for each cuisine
plt.tight_layout()
plt.show()
# 8. Total Time vs Cost
plt.figure(figsize=(10, 6))
plt.scatter(df['total_time'], df['cost_of_the_order'], alpha=0.5, s=30, c=df['cost_of_the_order'], cmap='viridis')
plt.colorbar(label='Cost ($)')
plt.xlabel('Total Time (Preparation + Delivery) in minutes')
plt.ylabel('Cost of Order ($)')
plt.title('Relationship between Total Time and Order Cost')
plt.grid(True, alpha=0.3)
# Chart choice: Scatter plot with color gradient shows relationship and highlights high-value orders
plt.tight_layout()
plt.show()
correlation_time_cost = df['total_time'].corr(df['cost_of_the_order'])
print(f"Correlation between total time and cost: {correlation_time_cost:.3f}")
Correlation between total time and cost: 0.006
# Summary of Multivariate Analysis
print("="*80)
print("SUMMARY OF MULTIVARIATE ANALYSIS")
print("="*80)
print("\n1. CORRELATION INSIGHTS:")
print(" - Total time is strongly correlated with both preparation and delivery times (by definition)")
print(" - Weak correlation between cost and preparation/delivery times")
print(" - Preparation time and delivery time show weak correlation")
print("\n2. COST PATTERNS:")
print(f" - Weekend orders slightly higher cost: ${weekend_mean:.2f} vs ${weekday_mean:.2f}")
print(" - Cuisine type significantly affects order cost")
print(" - No strong relationship between rating and cost")
print("\n3. TIME PATTERNS:")
print(f" - Delivery is faster on weekends ({weekend_delivery:.1f} min) than weekdays ({weekday_delivery:.1f} min)")
print(" - Total time (prep + delivery) averages ~51 minutes")
print(" - Preparation and delivery times are relatively independent")
print("\n4. CUISINE INSIGHTS:")
print(" - American cuisine dominates both weekday and weekend orders")
print(" - Different cuisines show varying average costs")
print(" - Popular cuisines (American, Japanese, Italian) maintain high order volumes")
print("\n5. RATING INSIGHTS:")
print(" - Ratings don't significantly correlate with order cost")
print(" - Higher ratings (4-5) are more common regardless of order value")
print(" - A rating of 5 is the most frequent among rated orders")
================================================================================
SUMMARY OF MULTIVARIATE ANALYSIS
================================================================================

1. CORRELATION INSIGHTS:
 - Total time is strongly correlated with both preparation and delivery times (by definition)
 - Weak correlation between cost and preparation/delivery times
 - Preparation time and delivery time show weak correlation

2. COST PATTERNS:
 - Weekend orders slightly higher cost: $16.57 vs $16.31
 - Cuisine type significantly affects order cost
 - No strong relationship between rating and cost

3. TIME PATTERNS:
 - Delivery is faster on weekends (22.5 min) than weekdays (28.3 min)
 - Total time (prep + delivery) averages ~51 minutes
 - Preparation and delivery times are relatively independent

4. CUISINE INSIGHTS:
 - American cuisine dominates both weekday and weekend orders
 - Different cuisines show varying average costs
 - Popular cuisines (American, Japanese, Italian) maintain high order volumes

5. RATING INSIGHTS:
 - Ratings don't significantly correlate with order cost
 - Higher ratings (4-5) are more common regardless of order value
 - A rating of 5 is the most frequent among rated orders
Question 13: The company wants to provide a promotional offer in the advertisement of the restaurants. The condition to get the offer is that the restaurants must have a rating count of more than 50 and the average rating should be greater than 4. Find the restaurants fulfilling the criteria to get the promotional offer. [3 marks]¶
# Group by restaurant and calculate rating statistics (ratings already cleaned to numeric)
restaurant_ratings = df[df['rating'].notna()].groupby('restaurant_name').agg({
'rating': ['count', 'mean']
}).reset_index()
restaurant_ratings.columns = ['restaurant_name', 'rating_count', 'avg_rating']
# Apply filters: rating_count > 50 AND avg_rating > 4
eligible_restaurants = restaurant_ratings[
(restaurant_ratings['rating_count'] > 50) &
(restaurant_ratings['avg_rating'] > 4)
].sort_values('avg_rating', ascending=False)
print("Restaurants Eligible for Promotional Offer:")
print("="*80)
print(f"Criteria: Rating count > 50 AND Average rating > 4")
print("="*80)
if len(eligible_restaurants) > 0:
print(f"\nTotal eligible restaurants: {len(eligible_restaurants)}\n")
for i, row in eligible_restaurants.iterrows():
print(f"{row['restaurant_name']}:")
print(f" - Rating Count: {int(row['rating_count'])}")
print(f" - Average Rating: {row['avg_rating']:.2f}")
print()
# Visualize eligible restaurants
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
eligible_restaurants.set_index('restaurant_name')['avg_rating'].plot(kind='barh', color='green', edgecolor='black')
plt.xlabel('Average Rating')
plt.title('Average Rating of Eligible Restaurants')
plt.xlim(4, 5)
# Chart choice: Horizontal bar chart for easier restaurant name reading and rating comparison
plt.subplot(1, 2, 2)
eligible_restaurants.set_index('restaurant_name')['rating_count'].plot(kind='barh', color='blue', edgecolor='black')
plt.xlabel('Number of Ratings')
plt.title('Rating Count of Eligible Restaurants')
# Chart choice: Horizontal bar chart shows volume of customer feedback
plt.tight_layout()
plt.show()
else:
print("\nNo restaurants meet the criteria.")
print("Let's examine restaurants close to meeting the criteria:\n")
# Show restaurants with >50 ratings
high_volume = restaurant_ratings[restaurant_ratings['rating_count'] > 50].sort_values('avg_rating', ascending=False)
print("Restaurants with >50 ratings:")
print(high_volume.head(10))
# Show restaurants with >4 average rating
high_rating = restaurant_ratings[restaurant_ratings['avg_rating'] > 4].sort_values('rating_count', ascending=False)
print("\nRestaurants with >4 average rating:")
print(high_rating.head(10))
Restaurants Eligible for Promotional Offer:
================================================================================
Criteria: Rating count > 50 AND Average rating > 4
================================================================================

Total eligible restaurants: 4

The Meatball Shop:
 - Rating Count: 84
 - Average Rating: 4.51

Blue Ribbon Fried Chicken:
 - Rating Count: 64
 - Average Rating: 4.33

Shake Shack:
 - Rating Count: 133
 - Average Rating: 4.28

Blue Ribbon Sushi:
 - Rating Count: 73
 - Average Rating: 4.22
Observations:¶
Four restaurants qualify for the promotional offer: The Meatball Shop (84 ratings, 4.51 average), Blue Ribbon Fried Chicken (64 ratings, 4.33), Shake Shack (133 ratings, 4.28), and Blue Ribbon Sushi (73 ratings, 4.22). These are also the platform's highest-volume partners, so the offer reinforces already strong demand; FoodHub could consider complementary promotions to help smaller restaurants with high average ratings accumulate enough rating volume to qualify.
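The same eligibility filter can be written compactly with pandas named aggregations and a boolean mask. A minimal, self-contained sketch on hypothetical ratings (the notebook would group the cleaned `df` instead):

```python
import pandas as pd

# Hypothetical rated orders standing in for the cleaned notebook df
orders = pd.DataFrame({
    "restaurant_name": ["A"] * 60 + ["B"] * 40,
    "rating": [5] * 60 + [3] * 40,
})

# Named aggregations give flat, readable column names (no MultiIndex)
stats = orders.groupby("restaurant_name").agg(
    rating_count=("rating", "count"),
    avg_rating=("rating", "mean"),
)

# Apply both promotional criteria at once
eligible = stats[(stats["rating_count"] > 50) & (stats["avg_rating"] > 4)]
print(eligible)  # only restaurant A qualifies (60 ratings, mean 5.0)
```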
Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]¶
# Calculate commission based on order cost
def calculate_commission(cost):
if cost > 20:
return cost * 0.25
elif cost > 5:
return cost * 0.15
else:
return 0
# Apply commission calculation
df['commission'] = df['cost_of_the_order'].apply(calculate_commission)
# Calculate total revenue
total_revenue = df['commission'].sum()
total_order_value = df['cost_of_the_order'].sum()
avg_commission_rate = (total_revenue / total_order_value) * 100
# Breakdown by category
orders_above_20 = df[df['cost_of_the_order'] > 20]
orders_5_to_20 = df[(df['cost_of_the_order'] > 5) & (df['cost_of_the_order'] <= 20)]
orders_below_5 = df[df['cost_of_the_order'] <= 5]
revenue_above_20 = orders_above_20['commission'].sum()
revenue_5_to_20 = orders_5_to_20['commission'].sum()
revenue_below_5 = orders_below_5['commission'].sum()
print("REVENUE ANALYSIS")
print("="*80)
print(f"\nTotal Revenue Generated: ${total_revenue:,.2f}")
print(f"Total Order Value: ${total_order_value:,.2f}")
print(f"Average Commission Rate: {avg_commission_rate:.2f}%")
print("\n" + "="*80)
print("REVENUE BREAKDOWN BY ORDER VALUE:")
print("="*80)
print(f"\n1. Orders > $20 (25% commission):")
print(f" - Number of orders: {len(orders_above_20)}")
print(f" - Total order value: ${orders_above_20['cost_of_the_order'].sum():,.2f}")
print(f" - Revenue generated: ${revenue_above_20:,.2f}")
print(f" - Percentage of total revenue: {revenue_above_20/total_revenue*100:.2f}%")
print(f"\n2. Orders $5-$20 (15% commission):")
print(f" - Number of orders: {len(orders_5_to_20)}")
print(f" - Total order value: ${orders_5_to_20['cost_of_the_order'].sum():,.2f}")
print(f" - Revenue generated: ${revenue_5_to_20:,.2f}")
print(f" - Percentage of total revenue: {revenue_5_to_20/total_revenue*100:.2f}%")
print(f"\n3. Orders ≤ $5 (0% commission):")
print(f" - Number of orders: {len(orders_below_5)}")
print(f" - Total order value: ${orders_below_5['cost_of_the_order'].sum():,.2f}")
print(f" - Revenue generated: ${revenue_below_5:,.2f}")
# Visualize revenue distribution
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
categories = ['Orders > $20\n(25%)', 'Orders $5-$20\n(15%)', 'Orders ≤ $5\n(0%)']
revenues = [revenue_above_20, revenue_5_to_20, revenue_below_5]
colors = ['#2ecc71', '#3498db', '#e74c3c']
plt.bar(categories, revenues, color=colors, edgecolor='black')
plt.ylabel('Revenue ($)')
plt.title('Revenue by Order Category')
for i, v in enumerate(revenues):
plt.text(i, v + 100, f'${v:,.0f}', ha='center', va='bottom')
# Chart choice: Bar chart with value labels clearly shows revenue contribution by tier
plt.subplot(1, 2, 2)
order_counts = [len(orders_above_20), len(orders_5_to_20), len(orders_below_5)]
plt.bar(categories, order_counts, color=colors, edgecolor='black')
plt.ylabel('Number of Orders')
plt.title('Order Volume by Category')
for i, v in enumerate(order_counts):
plt.text(i, v + 20, f'{v}', ha='center', va='bottom')
# Chart choice: Bar chart shows volume vs revenue relationship across tiers
plt.tight_layout()
plt.show()
# Pie chart of revenue contribution
plt.figure(figsize=(10, 6))
plt.pie([revenue_above_20, revenue_5_to_20],
labels=['Orders > $20 (25%)', 'Orders $5-$20 (15%)'],
autopct='%1.1f%%',
colors=['#2ecc71', '#3498db'],
startangle=90)
plt.title('Revenue Contribution by Order Category')
# Chart choice: Pie chart effectively shows proportional revenue contribution
plt.tight_layout()
plt.show()
REVENUE ANALYSIS
================================================================================
Total Revenue Generated: $6,166.30
Total Order Value: $31,314.82
Average Commission Rate: 19.69%
================================================================================
REVENUE BREAKDOWN BY ORDER VALUE:
================================================================================

1. Orders > $20 (25% commission):
   - Number of orders: 555
   - Total order value: $14,754.91
   - Revenue generated: $3,688.73
   - Percentage of total revenue: 59.82%

2. Orders $5-$20 (15% commission):
   - Number of orders: 1334
   - Total order value: $16,517.17
   - Revenue generated: $2,477.58
   - Percentage of total revenue: 40.18%

3. Orders ≤ $5 (0% commission):
   - Number of orders: 9
   - Total order value: $42.74
   - Revenue generated: $0.00
Observations:¶
The tiered commission structure generated \$6,166.30 in net revenue on \$31,314.82 of total order value, an effective commission rate of 19.69%. Orders above \$20 contribute the majority of revenue (59.82%) despite being fewer in number (555 orders versus 1,334 in the \$5-\$20 tier), demonstrating the value of high-ticket orders. Only 9 orders fall at or below \$5, so the revenue forgone in the 0% tier is negligible; the larger opportunity is nudging mid-tier orders above the \$20 threshold, which might inform future pricing strategies.
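The tiered calculation above can be written compactly with `np.select`, which maps each order cost to its commission rate in one vectorized step. This is a minimal sketch on synthetic order costs; the notebook applies the same tier boundaries to `df['cost_of_the_order']`.

```python
import numpy as np
import pandas as pd

# Hypothetical sample of order costs (not the notebook's data)
orders = pd.DataFrame({'cost_of_the_order': [25.00, 12.50, 4.00, 30.00, 19.99]})

cost = orders['cost_of_the_order']
# Tiered commission: 25% above $20, 15% for $5-$20, 0% at or below $5
rate = np.select([cost > 20, cost > 5], [0.25, 0.15], default=0.0)
orders['revenue'] = cost * rate

print(f"Total revenue: ${orders['revenue'].sum():,.2f}")
```

`np.select` evaluates the conditions in order, so an order of \$25 matches the first condition and never falls through to the 15% tier.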
Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]¶
# Total time already calculated earlier as: total_time = food_preparation_time + delivery_time
count_above_60 = (df['total_time'] > 60).sum()
percentage_above_60 = (count_above_60 / len(df)) * 100
print("TOTAL DELIVERY TIME ANALYSIS")
print("="*80)
print(f"Orders taking more than 60 minutes: {count_above_60}")
print(f"Total orders: {len(df)}")
print(f"Percentage: {percentage_above_60:.2f}%")
# Additional breakdown
orders_0_30 = len(df[df['total_time'] <= 30])
orders_30_45 = len(df[(df['total_time'] > 30) & (df['total_time'] <= 45)])
orders_45_60 = len(df[(df['total_time'] > 45) & (df['total_time'] <= 60)])
orders_above_60 = len(df[df['total_time'] > 60])
print("\n" + "="*80)
print("TOTAL TIME DISTRIBUTION:")
print("="*80)
print(f"≤ 30 minutes: {orders_0_30} orders ({orders_0_30/len(df)*100:.2f}%)")
print(f"30-45 minutes: {orders_30_45} orders ({orders_30_45/len(df)*100:.2f}%)")
print(f"45-60 minutes: {orders_45_60} orders ({orders_45_60/len(df)*100:.2f}%)")
print(f"> 60 minutes: {orders_above_60} orders ({orders_above_60/len(df)*100:.2f}%)")
print(f"\nAverage total time: {df['total_time'].mean():.2f} minutes")
print(f"Median total time: {df['total_time'].median():.2f} minutes")
# Visualize
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(df['total_time'], bins=30, edgecolor='black', color='skyblue')
plt.axvline(60, color='red', linestyle='--', linewidth=2, label='60 minutes threshold')
plt.axvline(df['total_time'].mean(), color='green', linestyle='--', linewidth=2,
            label=f'Mean: {df["total_time"].mean():.1f} min')
plt.xlabel('Total Time (minutes)')
plt.ylabel('Frequency')
plt.title('Distribution of Total Delivery Time')
plt.legend()
# Chart choice: Histogram with threshold line visually shows what proportion exceeds 60 minutes
plt.subplot(1, 2, 2)
categories = ['≤ 30 min', '30-45 min', '45-60 min', '> 60 min']
counts = [orders_0_30, orders_30_45, orders_45_60, orders_above_60]
colors = ['#2ecc71', '#3498db', '#f39c12', '#e74c3c']
plt.bar(categories, counts, color=colors, edgecolor='black')
plt.ylabel('Number of Orders')
plt.title('Orders by Total Time Range')
plt.xticks(rotation=45, ha='right')
for i, v in enumerate(counts):
    plt.text(i, v + 20, f'{v}\n({v/len(df)*100:.1f}%)', ha='center', va='bottom')
# Chart choice: Bar chart with color coding (green to red) emphasizes delivery speed tiers
plt.tight_layout()
plt.show()
TOTAL DELIVERY TIME ANALYSIS
================================================================================
Orders taking more than 60 minutes: 200
Total orders: 1898
Percentage: 10.54%
================================================================================
TOTAL TIME DISTRIBUTION:
================================================================================
≤ 30 minutes: 0 orders (0.00%)
30-45 minutes: 363 orders (19.13%)
45-60 minutes: 1335 orders (70.34%)
> 60 minutes: 200 orders (10.54%)

Average total time: 51.53 minutes
Median total time: 52.00 minutes
Observations:¶
About 10.5% of orders (200 of 1,898) take more than 60 minutes from placement to delivery, a threshold typically considered a key customer satisfaction limit. With a mean total time of 51.53 minutes and over 70% of orders in the 45-60 minute band, many orders sit close to that limit, so even small delays push them over. Orders taking longer than an hour may lead to customer complaints and lower ratings; tracking this metric helps FoodHub identify operational bottlenecks and set realistic delivery expectations.
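The manual boolean masks used for the time buckets can also be expressed with `pd.cut`, which keeps all the bin edges in one place. A sketch on synthetic total times; the bin edges match the buckets above (right-closed intervals, so `(45, 60]` corresponds to "45-60 minutes").

```python
import pandas as pd

# Hypothetical total times in minutes (prep + delivery), not the notebook's data
sample_times = pd.Series([28, 35, 50, 61, 44, 70, 59])

bins = [0, 30, 45, 60, float('inf')]
labels = ['<= 30 min', '30-45 min', '45-60 min', '> 60 min']
buckets = pd.cut(sample_times, bins=bins, labels=labels)

# Counts per bucket, in label order rather than descending frequency
dist = buckets.value_counts().reindex(labels)
print(dist)
print(f"> 60 min share: {(sample_times > 60).mean() * 100:.2f}%")
```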
Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]¶
# Calculate mean delivery time by day type
delivery_by_day = df.groupby('day_of_the_week')['delivery_time'].agg(['mean', 'median', 'std', 'min', 'max'])
print("DELIVERY TIME COMPARISON: WEEKDAYS VS WEEKENDS")
print("="*80)
print(delivery_by_day)
weekday_mean = df[df['day_of_the_week'] == 'Weekday']['delivery_time'].mean()
weekend_mean = df[df['day_of_the_week'] == 'Weekend']['delivery_time'].mean()
difference = abs(weekend_mean - weekday_mean)
print("\n" + "="*80)
print(f"Mean delivery time on Weekdays: {weekday_mean:.2f} minutes")
print(f"Mean delivery time on Weekends: {weekend_mean:.2f} minutes")
print(f"Difference: {difference:.2f} minutes")
if weekend_mean > weekday_mean:
    print(f"Weekends are {difference:.2f} minutes slower ({difference/weekday_mean*100:.1f}% increase)")
else:
    print(f"Weekdays are {difference:.2f} minutes slower ({difference/weekend_mean*100:.1f}% increase)")
# Statistical test (t-test) to check if difference is significant
weekday_times = df[df['day_of_the_week'] == 'Weekday']['delivery_time']
weekend_times = df[df['day_of_the_week'] == 'Weekend']['delivery_time']
t_stat, p_value = stats.ttest_ind(weekday_times, weekend_times)
print(f"\nt-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")
if p_value < 0.05:
    print("The difference is statistically significant (p < 0.05)")
else:
    print("The difference is not statistically significant (p >= 0.05)")
# Visualize
plt.figure(figsize=(14, 6))
plt.subplot(1, 3, 1)
df.boxplot(column='delivery_time', by='day_of_the_week', ax=plt.gca())
plt.title('Delivery Time Distribution')
plt.suptitle('')
plt.ylabel('Delivery Time (minutes)')
# Chart choice: Boxplot compares distributions and shows if one group has more variability
plt.subplot(1, 3, 2)
means = [weekday_mean, weekend_mean]
plt.bar(['Weekday', 'Weekend'], means, color=['steelblue', 'coral'], edgecolor='black')
plt.ylabel('Mean Delivery Time (minutes)')
plt.title('Average Delivery Time Comparison')
for i, v in enumerate(means):
    plt.text(i, v + 0.3, f'{v:.2f} min', ha='center', va='bottom')
# Chart choice: Bar chart with value labels makes the mean difference immediately visible
plt.subplot(1, 3, 3)
plt.hist([weekday_times, weekend_times], bins=20, label=['Weekday', 'Weekend'],
         color=['steelblue', 'coral'], alpha=0.7, edgecolor='black')
plt.xlabel('Delivery Time (minutes)')
plt.ylabel('Frequency')
plt.title('Delivery Time Distribution Overlay')
plt.legend()
# Chart choice: Overlapping histograms show if the distributions have different shapes
plt.tight_layout()
plt.show()
DELIVERY TIME COMPARISON: WEEKDAYS VS WEEKENDS
================================================================================
mean median std min max
day_of_the_week
Weekday 28.340037 28.0 2.891428 24 33
Weekend 22.470022 22.0 4.628938 15 30
================================================================================
Mean delivery time on Weekdays: 28.34 minutes
Mean delivery time on Weekends: 22.47 minutes
Difference: 5.87 minutes
Weekdays are 5.87 minutes slower (26.1% increase)
t-statistic: 27.5591
p-value: 0.0000
The difference is statistically significant (p < 0.05)
Observations:¶
Deliveries are noticeably faster on weekends: 22.47 minutes on average versus 28.34 minutes on weekdays, a 5.87-minute gap (a 26.1% increase over the weekend mean) that the t-test confirms is statistically significant. Plausible explanations include heavier weekday traffic, different delivery person availability, or shifting order volumes. Understanding these patterns helps FoodHub optimize delivery logistics and set appropriate customer expectations for different days of the week.
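Delivery times here are bounded integers and need not be normally distributed, so a nonparametric Mann-Whitney U test is a useful robustness check alongside the t-test. A sketch on seeded synthetic times that mimic the weekday/weekend split, not the notebook's actual data.

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
# Synthetic delivery times (minutes): weekdays near 24-33, weekends near 15-30
weekday = rng.integers(24, 34, size=200)
weekend = rng.integers(15, 31, size=500)

# Mann-Whitney U compares distributions by rank, with no normality assumption
u_stat, p_value = stats.mannwhitneyu(weekday, weekend, alternative='two-sided')
print(f"U = {u_stat:.1f}, p = {p_value:.4g}")
if p_value < 0.05:
    print("Distributions differ significantly (p < 0.05)")
```

If both the t-test and the rank test agree, the weekday/weekend difference is unlikely to be an artifact of distributional assumptions.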
Conclusion and Recommendations¶
Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]¶
Conclusions¶
- Ratings coverage is limited
- A large share of orders are not rated. This reduces confidence when using ratings alone to judge performance.
- Recommendation: increase rating participation (light incentives + frictionless prompts).
- Operations show a long-tail risk
- Mean delivery/prep times are useful, but the outlier check shows there are extreme cases.
- Those extremes are likely to hurt customer experience disproportionately (late orders are memorable).
- Time metrics are key levers
- After converting ratings to numeric, you can examine whether delivery/prep time relates to rating.
- Even if correlation is modest, it is operationally actionable: reducing long delays improves consistency.
- Partner segmentation is actionable
- The KPI tables by cuisine and by top restaurants show where volume and time performance concentrate.
- This supports targeted partner coaching rather than broad, generic actions.
Recommendations (action-oriented)¶
A) Improve ratings capture (data quality + customer insight)
- Add an in-app prompt within minutes after delivery + optional small reward.
- Use a short 1-click rating first, then optional comment.
- Offer voice-to-text input for comments and ratings to ease the user experience.
B) Reduce the long-tail of delivery time
- Flag orders with unusually long prep/delivery time and audit them (restaurant load, driver availability, distance).
- Introduce operational alerts for delays beyond a threshold.
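One common way to define "unusually long" for the flagging rule above is Tukey's IQR fence. A sketch on hypothetical order times; the 1.5×IQR multiplier is a conventional default, not a FoodHub requirement, and the column names follow the notebook's `df`.

```python
import pandas as pd

# Hypothetical total delivery times in minutes (not the notebook's data)
sample_df = pd.DataFrame({'order_id': range(1, 9),
                          'total_time': [45, 50, 52, 48, 55, 95, 51, 49]})

q1, q3 = sample_df['total_time'].quantile([0.25, 0.75])
upper_fence = q3 + 1.5 * (q3 - q1)  # standard Tukey fence for outliers

# Orders beyond the fence would be audited (restaurant load, driver, distance)
flagged = sample_df[sample_df['total_time'] > upper_fence]
print(f"Upper fence: {upper_fence:.1f} min")
print(flagged)
```

The same fence could drive the proposed operational alerts: any in-flight order projected past the fence triggers a review.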
C) Target high-volume partners
- Use the “Top restaurants KPI” table to identify high-volume restaurants with high average total time.
- Propose operational improvements (prep batching, order readiness notifications, peak-hour staffing).
D) Weekend planning
- Use the weekday vs weekend comparison to decide whether weekend staffing/logistics need adjustment.
- If weekend delivery is slower, add capacity (drivers) or adjust estimated delivery times.
E) Dashboard the KPIs
- Track: rating rate, avg rating, avg prep, avg delivery, % outliers (by week).
- This turns the analysis into an ongoing operational tool.
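The KPI set above could be computed weekly with a small helper like the sketch below. It runs on synthetic orders; the column names follow the notebook's `df`, the `'Not given'` handling mirrors the earlier rating conversion, and the 60-minute outlier threshold is an assumption.

```python
import pandas as pd

# Synthetic weekly orders (not the notebook's data)
orders_df = pd.DataFrame({
    'rating': ['5', 'Not given', '4', '3', 'Not given', '5'],
    'food_preparation_time': [25, 30, 22, 28, 35, 27],
    'delivery_time': [20, 25, 30, 22, 28, 24],
})

def weekly_kpis(orders, outlier_threshold=60):
    """Compute the dashboard KPIs: rating rate, avg rating, avg times, % slow."""
    rated = pd.to_numeric(orders['rating'], errors='coerce')  # 'Not given' -> NaN
    total_time = orders['food_preparation_time'] + orders['delivery_time']
    return {
        'rating_rate_%': rated.notna().mean() * 100,
        'avg_rating': rated.mean(),
        'avg_prep_min': orders['food_preparation_time'].mean(),
        'avg_delivery_min': orders['delivery_time'].mean(),
        'pct_over_threshold_%': (total_time > outlier_threshold).mean() * 100,
    }

print(weekly_kpis(orders_df))
```

Running this per week (e.g. via `df.groupby` on an order-date column, if one is added) would turn the one-off analysis into the ongoing operational tool described above.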