801 أسطر
28 KiB
Python
801 أسطر
28 KiB
Python
# بسم الله الرحمن الرحيم
|
||
|
||
# بسم الله الرحمن الرحيم
|
||
|
||
# بسم الله الرحمن الرحيم
|
||
|
||
# بسم الله الرحمن الرحيم
|
||
|
||
import base64
|
||
import json
|
||
import pandas as pd
|
||
import os
|
||
import numpy as np
|
||
from datetime import datetime
|
||
import io
|
||
from difflib import SequenceMatcher
|
||
|
||
def sanitize_text(value):
|
||
"""
|
||
Sanitize text values to ensure they're UTF-8 compatible
|
||
"""
|
||
if pd.isna(value):
|
||
return ""
|
||
|
||
if isinstance(value, (int, float, np.integer, np.floating)):
|
||
return str(value)
|
||
|
||
if isinstance(value, str):
|
||
try:
|
||
return value.encode('utf-8', errors='ignore').decode('utf-8')
|
||
except:
|
||
cleaned = ''.join(char for char in value if ord(char) < 128 or char.isprintable())
|
||
return cleaned
|
||
|
||
try:
|
||
return str(value)
|
||
except:
|
||
return ""
|
||
|
||
def clean_site_name(name):
|
||
"""
|
||
Clean SiteName by standardizing similar values
|
||
"""
|
||
if pd.isna(name) or name == "":
|
||
return "Unknown"
|
||
|
||
name = str(name).strip().lower()
|
||
|
||
# Common variations mapping
|
||
site_mapping = {
|
||
'main': 'Main Site',
|
||
'main site': 'Main Site',
|
||
'mainstore': 'Main Site',
|
||
'main store': 'Main Site',
|
||
'north': 'North Site',
|
||
'north site': 'North Site',
|
||
'northstore': 'North Site',
|
||
'south': 'South Site',
|
||
'south site': 'South Site',
|
||
'southstore': 'South Site',
|
||
'east': 'East Site',
|
||
'east site': 'East Site',
|
||
'west': 'West Site',
|
||
'west site': 'West Site',
|
||
'central': 'Central Site',
|
||
'central site': 'Central Site'
|
||
}
|
||
|
||
for key, value in site_mapping.items():
|
||
if key in name:
|
||
return value
|
||
|
||
return name.title()
|
||
|
||
def create_brand_mapping(series):
|
||
"""
|
||
Dynamically create brand mapping by analyzing unique brand names
|
||
Uses first N characters and similarity matching
|
||
"""
|
||
# Get unique brand values (excluding nulls and unknowns)
|
||
unique_brands = series.dropna().unique()
|
||
unique_brands = [str(b).strip() for b in unique_brands if str(b).strip() != "" and str(b).strip().lower() != "unknown"]
|
||
|
||
# Dictionary to store mappings
|
||
brand_map = {}
|
||
|
||
# First, group by first 3-5 characters
|
||
brand_groups = {}
|
||
|
||
for brand in unique_brands:
|
||
brand_lower = brand.lower()
|
||
|
||
# Try different prefix lengths
|
||
for prefix_len in [5, 4, 3]:
|
||
if len(brand_lower) >= prefix_len:
|
||
prefix = brand_lower[:prefix_len]
|
||
if prefix not in brand_groups:
|
||
brand_groups[prefix] = []
|
||
brand_groups[prefix].append(brand)
|
||
break
|
||
|
||
# For each group, find the most common/canonical name
|
||
for prefix, brands in brand_groups.items():
|
||
if len(brands) == 1:
|
||
# Single brand - use it as is (capitalized)
|
||
brand_map[brands[0].lower()] = brands[0].title()
|
||
else:
|
||
# Multiple brands with same prefix - find the most frequent or common one
|
||
# Count occurrences in the original series
|
||
brand_counts = series.value_counts()
|
||
|
||
# Find the brand with highest count in this group
|
||
best_match = max(brands, key=lambda b: brand_counts.get(b, 0))
|
||
canonical_name = best_match.title()
|
||
|
||
# Map all variations to the canonical name
|
||
for brand in brands:
|
||
brand_map[brand.lower()] = canonical_name
|
||
|
||
# Also check for brands that are substrings of others
|
||
sorted_brands = sorted(unique_brands, key=len, reverse=True)
|
||
for i, long_brand in enumerate(sorted_brands):
|
||
long_lower = long_brand.lower()
|
||
for short_brand in sorted_brands[i+1:]:
|
||
short_lower = short_brand.lower()
|
||
if short_lower in long_lower and len(short_lower) > 3:
|
||
# Short brand is a substring of long brand
|
||
if short_brand.lower() not in brand_map:
|
||
brand_map[short_lower] = long_brand.title()
|
||
|
||
return brand_map
|
||
|
||
def clean_brand_dynamic(brand, brand_mapping):
|
||
"""
|
||
Clean Brand names using dynamic mapping
|
||
"""
|
||
if pd.isna(brand) or brand == "":
|
||
return "Unknown"
|
||
|
||
brand_str = str(brand).strip()
|
||
brand_lower = brand_str.lower()
|
||
|
||
# Check if we have a mapping for this brand
|
||
if brand_lower in brand_mapping:
|
||
return brand_mapping[brand_lower]
|
||
|
||
# Try partial matching using first few characters
|
||
for key, value in brand_mapping.items():
|
||
# Check if brand starts with the same prefix
|
||
if len(brand_lower) >= 3 and len(key) >= 3:
|
||
if brand_lower[:3] == key[:3]:
|
||
return value
|
||
|
||
# If not found, return title case
|
||
return brand_str.title()
|
||
|
||
def calculate_age_from_dob(dob_value, transaction_date):
|
||
"""
|
||
Convert DOB to age based on transaction date, not today's date
|
||
Handles 1900-01-01 as Unknown
|
||
"""
|
||
if pd.isna(dob_value) or dob_value == "":
|
||
return "Unknown"
|
||
|
||
if pd.isna(transaction_date) or transaction_date == "":
|
||
return "Unknown"
|
||
|
||
dob_str = str(dob_value).strip()
|
||
|
||
# Check for the placeholder date
|
||
if dob_str.startswith('1900-01-01') or dob_str.startswith('1900/01/01') or dob_str == '1900-01-01':
|
||
return "Unknown"
|
||
|
||
try:
|
||
# Parse DOB
|
||
if '-' in dob_str:
|
||
dob = pd.to_datetime(dob_str.split()[0])
|
||
elif '/' in dob_str:
|
||
dob = pd.to_datetime(dob_str)
|
||
else:
|
||
return "Unknown"
|
||
|
||
# Parse Transaction Date
|
||
trans_date_str = str(transaction_date).strip()
|
||
if '-' in trans_date_str:
|
||
trans_date = pd.to_datetime(trans_date_str.split()[0])
|
||
elif '/' in trans_date_str:
|
||
trans_date = pd.to_datetime(trans_date_str)
|
||
else:
|
||
return "Unknown"
|
||
|
||
# Calculate age at time of transaction
|
||
age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day))
|
||
|
||
if age < 0 or age > 120: # Sanity check
|
||
return "Unknown"
|
||
|
||
return age
|
||
except:
|
||
return "Unknown"
|
||
|
||
def calculate_registration_duration(registration_date, transaction_date):
|
||
"""
|
||
Calculate number of days between registration and transaction
|
||
"""
|
||
if pd.isna(registration_date) or registration_date == "":
|
||
return "Unknown"
|
||
|
||
if pd.isna(transaction_date) or transaction_date == "":
|
||
return "Unknown"
|
||
|
||
try:
|
||
# Parse Registration Date
|
||
reg_str = str(registration_date).strip()
|
||
if '-' in reg_str:
|
||
reg_date = pd.to_datetime(reg_str.split()[0])
|
||
elif '/' in reg_str:
|
||
reg_date = pd.to_datetime(reg_str)
|
||
else:
|
||
return "Unknown"
|
||
|
||
# Parse Transaction Date
|
||
trans_str = str(transaction_date).strip()
|
||
if '-' in trans_str:
|
||
trans_date = pd.to_datetime(trans_str.split()[0])
|
||
elif '/' in trans_str:
|
||
trans_date = pd.to_datetime(trans_str)
|
||
else:
|
||
return "Unknown"
|
||
|
||
# Calculate days difference
|
||
days_diff = (trans_date - reg_date).days
|
||
|
||
if days_diff < 0:
|
||
return "0" # Transaction before registration - treat as 0
|
||
|
||
if days_diff > 3650: # Cap at 10 years (sanity check)
|
||
return "3650+"
|
||
|
||
return days_diff
|
||
except:
|
||
return "Unknown"
|
||
|
||
def merge_contact_methods(row):
|
||
"""
|
||
Merge Email, SMS, Mail, Phone into one column with priority order
|
||
"""
|
||
contact_methods = []
|
||
|
||
if row.get('ContactByEmail') == 1 or str(row.get('ContactByEmail', '')).lower() == 'true' or str(row.get('ContactByEmail', '')).lower() == 'yes':
|
||
contact_methods.append('Email')
|
||
if row.get('ContactBySMS') == 1 or str(row.get('ContactBySMS', '')).lower() == 'true' or str(row.get('ContactBySMS', '')).lower() == 'yes':
|
||
contact_methods.append('SMS')
|
||
if row.get('ContactByMail') == 1 or str(row.get('ContactByMail', '')).lower() == 'true' or str(row.get('ContactByMail', '')).lower() == 'yes':
|
||
contact_methods.append('Mail')
|
||
if row.get('ContactByPhone') == 1 or str(row.get('ContactByPhone', '')).lower() == 'true' or str(row.get('ContactByPhone', '')).lower() == 'yes':
|
||
contact_methods.append('Phone')
|
||
|
||
if not contact_methods:
|
||
return 'NoContact'
|
||
|
||
return ','.join(contact_methods) # Return all methods as comma-separated
|
||
|
||
def extract_date_components(date_value, column_name, reference_date=None):
|
||
"""
|
||
Extract Year, Month, TimeOfMonth, Day from date
|
||
"""
|
||
if pd.isna(date_value) or date_value == "":
|
||
return {
|
||
f'{column_name}_Year': "Unknown",
|
||
f'{column_name}_Month': "Unknown",
|
||
f'{column_name}_TimeOfMonth': "Unknown",
|
||
f'{column_name}_Day': "Unknown"
|
||
}
|
||
|
||
try:
|
||
# Parse the date
|
||
date_str = str(date_value).strip()
|
||
if '-' in date_str:
|
||
date_obj = pd.to_datetime(date_str.split()[0])
|
||
elif '/' in date_str:
|
||
date_obj = pd.to_datetime(date_str)
|
||
else:
|
||
return {
|
||
f'{column_name}_Year': "Unknown",
|
||
f'{column_name}_Month': "Unknown",
|
||
f'{column_name}_TimeOfMonth': "Unknown",
|
||
f'{column_name}_Day': "Unknown"
|
||
}
|
||
|
||
# Extract components
|
||
year = date_obj.year
|
||
|
||
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
|
||
'July', 'August', 'September', 'October', 'November', 'December']
|
||
month = month_names[date_obj.month - 1]
|
||
|
||
day_num = date_obj.day
|
||
if 1 <= day_num <= 10:
|
||
time_of_month = "Beginning (1-10)"
|
||
elif 11 <= day_num <= 20:
|
||
time_of_month = "Middle (11-20)"
|
||
else:
|
||
time_of_month = "End (21-31)"
|
||
|
||
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
|
||
day = day_names[date_obj.weekday()]
|
||
|
||
return {
|
||
f'{column_name}_Year': year,
|
||
f'{column_name}_Month': month,
|
||
f'{column_name}_TimeOfMonth': time_of_month,
|
||
f'{column_name}_Day': day
|
||
}
|
||
except:
|
||
return {
|
||
f'{column_name}_Year': "Unknown",
|
||
f'{column_name}_Month': "Unknown",
|
||
f'{column_name}_TimeOfMonth': "Unknown",
|
||
f'{column_name}_Day': "Unknown"
|
||
}
|
||
|
||
def add_recurring_customer_flag(df, userid_column='Userid'):
|
||
"""
|
||
Add a flag indicating if customer is recurring (has multiple transactions)
|
||
"""
|
||
# Count transactions per user
|
||
user_transaction_counts = df[userid_column].value_counts()
|
||
|
||
# Create flag: 1 if more than 1 transaction, 0 otherwise
|
||
df['IsRecurringCustomer'] = df[userid_column].map(
|
||
lambda x: 1 if user_transaction_counts.get(x, 0) > 1 else 0
|
||
)
|
||
|
||
print(f" 🔄 Added 'IsRecurringCustomer' flag: {df['IsRecurringCustomer'].sum()} recurring customers out of {df[userid_column].nunique()} unique users")
|
||
|
||
return df
|
||
|
||
def transform_dataframe(df):
|
||
"""
|
||
Apply all transformations to the dataframe
|
||
"""
|
||
print("\n 🔄 Applying transformations...")
|
||
|
||
# A > Keep Userid and add recurring customer flag
|
||
if 'Userid' in df.columns:
|
||
print(" ✅ Keeping 'Userid' and adding recurring customer flag")
|
||
df = add_recurring_customer_flag(df, 'Userid')
|
||
else:
|
||
print(" ⚠️ 'Userid' column not found")
|
||
|
||
# B > Drop StoreId (same value)
|
||
if 'StoreId' in df.columns:
|
||
df = df.drop(columns=['StoreId'])
|
||
print(" 🗑️ Dropped 'StoreId'")
|
||
|
||
# C > Drop Store (same value)
|
||
if 'Store' in df.columns:
|
||
df = df.drop(columns=['Store'])
|
||
print(" 🗑️ Dropped 'Store'")
|
||
|
||
# D > Drop ParentSiteId (same value)
|
||
if 'ParentSiteId' in df.columns:
|
||
df = df.drop(columns=['ParentSiteId'])
|
||
print(" 🗑️ Dropped 'ParentSiteId'")
|
||
|
||
# E > Drop SiteType (same value)
|
||
if 'SiteType' in df.columns:
|
||
df = df.drop(columns=['SiteType'])
|
||
print(" 🗑️ Dropped 'SiteType'")
|
||
|
||
# F > Keep Gender
|
||
if 'Gender' in df.columns:
|
||
print(" ✅ Keeping 'Gender'")
|
||
|
||
# G > Convert DOB to Age (using TransactionDate as reference)
|
||
if 'DOB' in df.columns and 'TransactionDate' in df.columns:
|
||
df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1)
|
||
df = df.drop(columns=['DOB'])
|
||
print(" ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)")
|
||
elif 'DOB' in df.columns:
|
||
print(" ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly")
|
||
|
||
# H > Convert RegistrationDate to duration (days since registration)
|
||
if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns:
|
||
df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1)
|
||
df = df.drop(columns=['RegistrationDate'])
|
||
print(" ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)")
|
||
elif 'RegistrationDate' in df.columns:
|
||
print(" ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is")
|
||
|
||
# I > Drop FirstLoginDate
|
||
if 'FirstLoginDate' in df.columns:
|
||
df = df.drop(columns=['FirstLoginDate'])
|
||
print(" 🗑️ Dropped 'FirstLoginDate'")
|
||
|
||
# J > Drop LastLoginDate
|
||
if 'LastLoginDate' in df.columns:
|
||
df = df.drop(columns=['LastLoginDate'])
|
||
print(" 🗑️ Dropped 'LastLoginDate'")
|
||
|
||
# K,L,M,N > Merge ContactBy columns
|
||
contact_columns = ['ContactByEmail', 'ContactBySMS', 'ContactByMail', 'ContactByPhone']
|
||
existing_contact_cols = [col for col in contact_columns if col in df.columns]
|
||
if existing_contact_cols:
|
||
df['ContactMethod'] = df.apply(merge_contact_methods, axis=1)
|
||
df = df.drop(columns=existing_contact_cols)
|
||
print(f" ✅ Merged {len(existing_contact_cols)} contact columns into 'ContactMethod'")
|
||
|
||
# O > Drop ContactStatus
|
||
if 'ContactStatus' in df.columns:
|
||
df = df.drop(columns=['ContactStatus'])
|
||
print(" 🗑️ Dropped 'ContactStatus'")
|
||
|
||
# P > Drop TermsConsent
|
||
if 'TermsConsent' in df.columns:
|
||
df = df.drop(columns=['TermsConsent'])
|
||
print(" 🗑️ Dropped 'TermsConsent'")
|
||
|
||
# Q > Drop CommunityName
|
||
if 'CommunityName' in df.columns:
|
||
df = df.drop(columns=['CommunityName'])
|
||
print(" 🗑️ Dropped 'CommunityName'")
|
||
|
||
# R > Drop CountryId
|
||
if 'CountryId' in df.columns:
|
||
df = df.drop(columns=['CountryId'])
|
||
print(" 🗑️ Dropped 'CountryId'")
|
||
|
||
# S > Keep Country
|
||
if 'Country' in df.columns:
|
||
print(" ✅ Keeping 'Country'")
|
||
|
||
# T > Drop StateCode
|
||
if 'StateCode' in df.columns:
|
||
df = df.drop(columns=['StateCode'])
|
||
print(" 🗑️ Dropped 'StateCode'")
|
||
|
||
# U > Keep StateName
|
||
if 'StateName' in df.columns:
|
||
print(" ✅ Keeping 'StateName'")
|
||
|
||
# V > Drop City
|
||
if 'City' in df.columns:
|
||
df = df.drop(columns=['City'])
|
||
print(" 🗑️ Dropped 'City'")
|
||
|
||
# W > Drop PostalCode
|
||
if 'PostalCode' in df.columns:
|
||
df = df.drop(columns=['PostalCode'])
|
||
print(" 🗑️ Dropped 'PostalCode'")
|
||
|
||
# X > Drop Title
|
||
if 'Title' in df.columns:
|
||
df = df.drop(columns=['Title'])
|
||
print(" 🗑️ Dropped 'Title'")
|
||
|
||
# Y > Drop Salutation
|
||
if 'Salutation' in df.columns:
|
||
df = df.drop(columns=['Salutation'])
|
||
print(" 🗑️ Dropped 'Salutation'")
|
||
|
||
# Z > Keep R
|
||
if 'R' in df.columns:
|
||
print(" ✅ Keeping 'R'")
|
||
|
||
# AA > Keep F
|
||
if 'F' in df.columns:
|
||
print(" ✅ Keeping 'F'")
|
||
|
||
# AB > Keep M
|
||
if 'M' in df.columns:
|
||
print(" ✅ Keeping 'M'")
|
||
|
||
# AC > Keep RFM
|
||
if 'RFM' in df.columns:
|
||
print(" ✅ Keeping 'RFM'")
|
||
|
||
# AD > Keep Tier
|
||
if 'Tier' in df.columns:
|
||
print(" ✅ Keeping 'Tier'")
|
||
|
||
# AE > Convert TransactionDate into date components
|
||
if 'TransactionDate' in df.columns:
|
||
date_components = df['TransactionDate'].apply(lambda x: extract_date_components(x, 'Transaction'))
|
||
date_df = pd.DataFrame(date_components.tolist())
|
||
df = pd.concat([df, date_df], axis=1)
|
||
df = df.drop(columns=['TransactionDate'])
|
||
print(" ✅ Converted 'TransactionDate' into 4 columns (Transaction_Year, Transaction_Month, Transaction_TimeOfMonth, Transaction_Day)")
|
||
|
||
# AF > Drop CreateDate (as requested - it's the same as TransactionDate)
|
||
if 'CreateDate' in df.columns:
|
||
df = df.drop(columns=['CreateDate'])
|
||
print(" 🗑️ Dropped 'CreateDate' (duplicate of TransactionDate)")
|
||
|
||
# AG > Drop MemberId
|
||
if 'MemberId' in df.columns:
|
||
df = df.drop(columns=['MemberId'])
|
||
print(" 🗑️ Dropped 'MemberId'")
|
||
|
||
# AH > Drop SiteId
|
||
if 'SiteId' in df.columns:
|
||
df = df.drop(columns=['SiteId'])
|
||
print(" 🗑️ Dropped 'SiteId'")
|
||
|
||
# AI > Clean and keep SiteName
|
||
if 'SiteName' in df.columns:
|
||
df['SiteName'] = df['SiteName'].apply(clean_site_name)
|
||
print(" ✅ Kept and cleaned 'SiteName'")
|
||
|
||
# AJ > Keep Quantity
|
||
if 'Quantity' in df.columns:
|
||
print(" ✅ Keeping 'Quantity'")
|
||
|
||
# AK > Keep Amount
|
||
if 'Amount' in df.columns:
|
||
print(" ✅ Keeping 'Amount'")
|
||
|
||
# AL > Drop RewardType
|
||
if 'RewardType' in df.columns:
|
||
df = df.drop(columns=['RewardType'])
|
||
print(" 🗑️ Dropped 'RewardType'")
|
||
|
||
# AM > Keep Points
|
||
if 'Points' in df.columns:
|
||
print(" ✅ Keeping 'Points'")
|
||
|
||
# AN > Drop trxDetailId
|
||
if 'trxDetailId' in df.columns:
|
||
df = df.drop(columns=['trxDetailId'])
|
||
print(" 🗑️ Dropped 'trxDetailId'")
|
||
|
||
# AO > Drop TrxId
|
||
if 'TrxId' in df.columns:
|
||
df = df.drop(columns=['TrxId'])
|
||
print(" 🗑️ Dropped 'TrxId'")
|
||
|
||
# AP > Drop TransactionStatusId
|
||
if 'TransactionStatusId' in df.columns:
|
||
df = df.drop(columns=['TransactionStatusId'])
|
||
print(" 🗑️ Dropped 'TransactionStatusId'")
|
||
|
||
# AQ > Keep TransactionStatusName
|
||
if 'TransactionStatusName' in df.columns:
|
||
print(" ✅ Keeping 'TransactionStatusName'")
|
||
|
||
# AR > Drop TransactionTypeId
|
||
if 'TransactionTypeId' in df.columns:
|
||
df = df.drop(columns=['TransactionTypeId'])
|
||
print(" 🗑️ Dropped 'TransactionTypeId'")
|
||
|
||
# AS > Keep TransactionTypeName
|
||
if 'TransactionTypeName' in df.columns:
|
||
print(" ✅ Keeping 'TransactionTypeName'")
|
||
|
||
# AT > Drop Reportable
|
||
if 'Reportable' in df.columns:
|
||
df = df.drop(columns=['Reportable'])
|
||
print(" 🗑️ Dropped 'Reportable'")
|
||
|
||
# AU > Keep TransactionItemCode
|
||
if 'TransactionItemCode' in df.columns:
|
||
print(" ✅ Keeping 'TransactionItemCode'")
|
||
|
||
# AV > Keep AnalysisCode1
|
||
if 'AnalysisCode1' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode1'")
|
||
|
||
# AW > Keep AnalysisCode2
|
||
if 'AnalysisCode2' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode2'")
|
||
|
||
# AX > Keep AnalysisCode3
|
||
if 'AnalysisCode3' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode3'")
|
||
|
||
# AY > Keep AnalysisCode4
|
||
if 'AnalysisCode4' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode4'")
|
||
|
||
# AZ > Dynamically clean Brand
|
||
if 'Brand' in df.columns:
|
||
print(" 🔍 Analyzing unique brand names to create dynamic mapping...")
|
||
brand_mapping = create_brand_mapping(df['Brand'])
|
||
print(f" 📊 Created mapping for {len(brand_mapping)} unique brand variations")
|
||
df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping))
|
||
print(" ✅ Kept and dynamically cleaned 'Brand'")
|
||
|
||
# BA > Keep AnalysisCode6
|
||
if 'AnalysisCode6' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode6'")
|
||
|
||
# BB > Keep AnalysisCode7
|
||
if 'AnalysisCode7' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode7'")
|
||
|
||
# BC > Keep AnalysisCode8
|
||
if 'AnalysisCode8' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode8'")
|
||
|
||
# BD > Keep Price
|
||
if 'Price' in df.columns:
|
||
print(" ✅ Keeping 'Price'")
|
||
|
||
# BE > Keep AnalysisCode10
|
||
if 'AnalysisCode10' in df.columns:
|
||
print(" ✅ Keeping 'AnalysisCode10'")
|
||
|
||
# BF > Keep InvalidReason
|
||
if 'InvalidReason' in df.columns:
|
||
print(" ✅ Keeping 'InvalidReason'")
|
||
|
||
# BG > Drop Description
|
||
if 'Description' in df.columns:
|
||
df = df.drop(columns=['Description'])
|
||
print(" 🗑️ Dropped 'Description'")
|
||
|
||
# BH > Drop PromotionId
|
||
if 'PromotionId' in df.columns:
|
||
df = df.drop(columns=['PromotionId'])
|
||
print(" 🗑️ Dropped 'PromotionId'")
|
||
|
||
# BI > Keep PromotionName
|
||
if 'PromotionName' in df.columns:
|
||
print(" ✅ Keeping 'PromotionName'")
|
||
|
||
# BJ > Convert PromotionStartDate into 4 columns
|
||
if 'PromotionStartDate' in df.columns:
|
||
date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart'))
|
||
date_df = pd.DataFrame(date_components.tolist())
|
||
df = pd.concat([df, date_df], axis=1)
|
||
df = df.drop(columns=['PromotionStartDate'])
|
||
print(" ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)")
|
||
|
||
# BK > Drop PromotionEndDate
|
||
if 'PromotionEndDate' in df.columns:
|
||
df = df.drop(columns=['PromotionEndDate'])
|
||
print(" 🗑️ Dropped 'PromotionEndDate'")
|
||
|
||
# BL > Drop PromotionOfferTypeId
|
||
if 'PromotionOfferTypeId' in df.columns:
|
||
df = df.drop(columns=['PromotionOfferTypeId'])
|
||
print(" 🗑️ Dropped 'PromotionOfferTypeId'")
|
||
|
||
# BM > Drop PromotionOfferTypeName
|
||
if 'PromotionOfferTypeName' in df.columns:
|
||
df = df.drop(columns=['PromotionOfferTypeName'])
|
||
print(" 🗑️ Dropped 'PromotionOfferTypeName'")
|
||
|
||
# BN > Drop PromotionSiteId
|
||
if 'PromotionSiteId' in df.columns:
|
||
df = df.drop(columns=['PromotionSiteId'])
|
||
print(" 🗑️ Dropped 'PromotionSiteId'")
|
||
|
||
# BO > Drop PromotionSite
|
||
if 'PromotionSite' in df.columns:
|
||
df = df.drop(columns=['PromotionSite'])
|
||
print(" 🗑️ Dropped 'PromotionSite'")
|
||
|
||
# BP > Drop QualifyingProductQuantity
|
||
if 'QualifyingProductQuantity' in df.columns:
|
||
df = df.drop(columns=['QualifyingProductQuantity'])
|
||
print(" 🗑️ Dropped 'QualifyingProductQuantity'")
|
||
|
||
print("\n ✅ All transformations completed!")
|
||
return df
|
||
|
||
def read_and_process_file(file_path, max_rows=5000):
|
||
"""
|
||
Read the Excel file and apply all transformations
|
||
"""
|
||
try:
|
||
print(f" 📖 Reading file: {file_path}")
|
||
|
||
# Read the Excel file
|
||
df = pd.read_excel(file_path)
|
||
|
||
print(f" 📊 Original columns: {list(df.columns)}")
|
||
print(f" 📏 Original shape: {df.shape}")
|
||
|
||
# Limit to first max_rows
|
||
original_row_count = len(df)
|
||
if len(df) > max_rows:
|
||
df = df.head(max_rows)
|
||
print(f" ✂️ Limited dataset to first {max_rows} rows (from {original_row_count} total rows)")
|
||
else:
|
||
print(f" ℹ️ Dataset has {len(df)} rows (within the {max_rows} row limit)")
|
||
|
||
# Apply all transformations
|
||
df = transform_dataframe(df)
|
||
|
||
# Sanitize all text data (final pass)
|
||
print("\n 🧹 Final sanitization of text data...")
|
||
for col in df.columns:
|
||
if df[col].dtype == 'object': # Only process string columns
|
||
df[col] = df[col].apply(sanitize_text)
|
||
|
||
# Convert DataFrame to CSV
|
||
csv_buffer = io.StringIO()
|
||
df.to_csv(csv_buffer, index=False, encoding='utf-8')
|
||
csv_content = csv_buffer.getvalue().encode('utf-8')
|
||
|
||
# Get original file name and create modified name
|
||
original_file_name = os.path.basename(file_path)
|
||
name, ext = os.path.splitext(original_file_name)
|
||
modified_file_name = f"{name}_transformed_{len(df)}_rows.csv"
|
||
|
||
print(f"\n ✅ Successfully processed file: {modified_file_name}")
|
||
print(f" 📊 Final columns: {list(df.columns)}")
|
||
print(f" 📏 Final shape: {df.shape}")
|
||
print(f" 📄 CSV file size: {len(csv_content)} bytes")
|
||
|
||
return csv_content, modified_file_name, df
|
||
|
||
except FileNotFoundError:
|
||
print(f"❌ Error: File '{file_path}' not found!")
|
||
return None, None, None
|
||
except Exception as e:
|
||
print(f"❌ Error processing file: {e}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
return None, None, None
|
||
|
||
def save_clean_dataset(df, file_name):
|
||
"""
|
||
Save the transformed dataset locally
|
||
"""
|
||
csv_file = f"transformed_{file_name}"
|
||
df.to_csv(csv_file, index=False, encoding='utf-8')
|
||
print(f"\n💾 Transformed dataset saved: {csv_file}")
|
||
|
||
excel_file = csv_file.replace('.csv', '.xlsx')
|
||
df.to_excel(excel_file, index=False)
|
||
print(f"💾 Excel version saved: {excel_file}")
|
||
|
||
return csv_file
|
||
|
||
def main():
|
||
"""
|
||
Main function to execute all transformations
|
||
"""
|
||
print("=" * 80)
|
||
print("🚢 Ship Performance Dataset - Complete Transformation")
|
||
print("=" * 80)
|
||
|
||
# Specify the path to your Excel file
|
||
excel_file_path = "C:/Users/Mikes/OneDrive/Pictures/MENA_BUSINESS_DATA/Transformation Schiff Sample File for Predictive analysis.xlsx"
|
||
|
||
# Process and transform the file
|
||
print("\n1️⃣ Reading and transforming Excel file...")
|
||
file_content, modified_file_name, df = read_and_process_file(excel_file_path, max_rows=5000)
|
||
|
||
if file_content is None:
|
||
print("\n❌ Process failed. Please check if the file exists.")
|
||
return
|
||
|
||
# Save locally
|
||
save_clean_dataset(df, modified_file_name)
|
||
|
||
# Save transformation summary
|
||
summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
|
||
with open(summary_file, 'w', encoding='utf-8') as f:
|
||
f.write("TRANSFORMATION SUMMARY\n")
|
||
f.write("=" * 50 + "\n\n")
|
||
f.write(f"Original file: {excel_file_path}\n")
|
||
f.write(f"Rows processed: {len(df)}\n")
|
||
f.write(f"Final columns: {len(df.columns)}\n\n")
|
||
f.write("Final columns list:\n")
|
||
for col in df.columns:
|
||
f.write(f" - {col}\n")
|
||
f.write("\n" + "=" * 50 + "\n\n")
|
||
f.write("Key transformations applied:\n")
|
||
f.write(" - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n")
|
||
f.write(" - Converted DOB to Age (using TransactionDate as reference, not today's date)\n")
|
||
f.write(" - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n")
|
||
f.write(" - Dropped CreateDate (duplicate of TransactionDate)\n")
|
||
f.write(" - Dynamically cleaned Brand names using prefix matching\n")
|
||
f.write(" - Cleaned SiteName variations\n")
|
||
f.write(" - Merged contact method columns into single ContactMethod field\n")
|
||
f.write(" - Split date columns into Year, Month, TimeOfMonth, Day components\n")
|
||
f.write(" - Removed redundant columns (StoreId, Store, SiteType, etc.)\n")
|
||
|
||
print(f"\n📄 Transformation summary saved: {summary_file}")
|
||
|
||
print("\n" + "=" * 80)
|
||
print("🎉 All transformations completed successfully! إن شاء الله")
|
||
print(f" ✅ {len(df)} rows processed")
|
||
print(f" ✅ {len(df.columns)} columns in final dataset")
|
||
print(" ✅ Recurring customer flag added")
|
||
print(" ✅ DOB converted to Age (using transaction date)")
|
||
print(" ✅ RegistrationDate converted to DaysSinceRegistration")
|
||
print(" ✅ CreateDate dropped (duplicate)")
|
||
print(" ✅ Contact methods merged")
|
||
print(" ✅ Date columns split into components")
|
||
print(" ✅ SiteName and Brand dynamically cleaned")
|
||
print("=" * 80)
|
||
|
||
if __name__ == "__main__":
|
||
main()
|
||
|
||
|