الملفات
transform_code/transform.py
2026-04-28 17:54:16 +00:00

801 أسطر
28 KiB
Python
خام اللوم التاريخ

هذا الملف يحتوي على أحرف Unicode غير مرئية

هذا الملف يحتوي على أحرف Unicode غير مرئية لا يمكن التمييز بينها بعين الإنسان ولكن قد تتم معالجتها بشكل مختلف بواسطة الحاسوب. إذا كنت تعتقد أن هذا مقصود، يمكنك تجاهل هذا التحذير بأمان. استخدم زر الهروب للكشف عنها.

هذا الملف يحتوي على أحرف Unicode قد تُخلط مع أحرف أخرى. إذا كنت تعتقد أن هذا مقصود، يمكنك تجاهل هذا التحذير بأمان. استخدم زر الهروب للكشف عنها.

# بسم الله الرحمن الرحيم
# بسم الله الرحمن الرحيم
# بسم الله الرحمن الرحيم
# بسم الله الرحمن الرحيم
import base64
import json
import pandas as pd
import os
import numpy as np
from datetime import datetime
import io
from difflib import SequenceMatcher
def sanitize_text(value):
    """
    Convert a single cell value into a UTF-8-safe string.

    Missing scalars (None, NaN, NaT) become "". Numbers are stringified.
    Strings have any code points that cannot be encoded as UTF-8 (for
    example lone surrogates) silently dropped. Any other object is passed
    through ``str()``; if that fails, "" is returned.
    """
    # pd.isna() returns an array for list-like input and its truth value is
    # ambiguous, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    if isinstance(value, (int, float, np.integer, np.floating)):
        return str(value)
    if isinstance(value, str):
        try:
            return value.encode('utf-8', errors='ignore').decode('utf-8')
        except UnicodeError:
            # Fallback kept from the original best-effort behaviour.
            return ''.join(char for char in value if ord(char) < 128 or char.isprintable())
    try:
        return str(value)
    except Exception:
        # A broken __str__ must not abort the whole sanitization pass.
        return ""
def clean_site_name(name):
    """
    Standardize a SiteName value to one of the known canonical site names.

    Missing, empty or whitespace-only values return "Unknown". Otherwise the
    name is lower-cased and split into alphanumeric words; the first mapping
    key that appears as a whole word (or whole-word sequence) decides the
    canonical name. Names that match no key are returned in title case.

    Matching is done on whole words rather than raw substrings so that
    unrelated names such as "Domain Outlet" are not collapsed into
    "Main Site".
    """
    if pd.isna(name) or name == "":
        return "Unknown"
    name = str(name).strip().lower()
    if not name:
        return "Unknown"
    # Common variations mapping
    site_mapping = {
        'main': 'Main Site',
        'main site': 'Main Site',
        'mainstore': 'Main Site',
        'main store': 'Main Site',
        'north': 'North Site',
        'north site': 'North Site',
        'northstore': 'North Site',
        'south': 'South Site',
        'south site': 'South Site',
        'southstore': 'South Site',
        'east': 'East Site',
        'east site': 'East Site',
        'west': 'West Site',
        'west site': 'West Site',
        'central': 'Central Site',
        'central site': 'Central Site',
    }
    # Replace punctuation with spaces and pad both ends so that
    # " key " only matches at word boundaries.
    words = ''.join(ch if ch.isalnum() else ' ' for ch in name).split()
    padded = f" {' '.join(words)} "
    for key, value in site_mapping.items():
        if f" {key} " in padded:
            return value
    return name.title()
def create_brand_mapping(series):
    """
    Build a mapping from lower-cased brand spellings to a canonical name.

    Brand values are stripped of surrounding whitespace; nulls, empty
    strings and "unknown" (any case) are ignored. Brands with at least
    three characters are grouped by the first five characters of their
    lower-cased form (fewer if the brand is shorter). Within each group the
    most frequent spelling, title-cased, becomes the canonical name for
    every member. Brands shorter than three characters are not mapped.

    Returns:
        dict mapping ``brand.lower()`` to the canonical title-cased name.
    """
    # Count occurrences on the *stripped* values, once, so that whitespace
    # variants of the same spelling are counted together. A dict keeps
    # first-appearance order, which is the tie-break used by max() below.
    brand_counts = {}
    for raw in series.dropna():
        brand = str(raw).strip()
        if brand == "" or brand.lower() == "unknown":
            continue
        brand_counts[brand] = brand_counts.get(brand, 0) + 1

    # Group by prefix: 5 characters when available, otherwise 4 or 3.
    brand_groups = {}
    for brand in brand_counts:
        brand_lower = brand.lower()
        if len(brand_lower) < 3:
            continue
        brand_groups.setdefault(brand_lower[:5], []).append(brand)

    # Map every spelling in a group to the group's most frequent spelling.
    brand_map = {}
    for brands in brand_groups.values():
        canonical_name = max(brands, key=brand_counts.get).title()
        for brand in brands:
            brand_map[brand.lower()] = canonical_name
    return brand_map
def clean_brand_dynamic(brand, brand_mapping):
    """
    Normalize one Brand value using a mapping of lower-cased spellings.

    Resolution order:
      1. missing / empty / whitespace-only values -> "Unknown"
      2. exact (case-insensitive) hit in ``brand_mapping``
      3. the literal "unknown" -> "Unknown" (never prefix-matched)
      4. first mapping key sharing the brand's first three characters
      5. otherwise the stripped brand in title case
    """
    if pd.isna(brand):
        return "Unknown"
    brand_str = str(brand).strip()
    brand_lower = brand_str.lower()
    if not brand_lower:
        return "Unknown"
    if brand_lower in brand_mapping:
        return brand_mapping[brand_lower]
    if brand_lower == "unknown":
        return "Unknown"
    # Prefix fallback only makes sense for brands with 3+ characters; a key
    # that starts with a 3-character prefix is necessarily 3+ long itself.
    if len(brand_lower) >= 3:
        prefix = brand_lower[:3]
        for key, value in brand_mapping.items():
            if key.startswith(prefix):
                return value
    return brand_str.title()
def calculate_age_from_dob(dob_value, transaction_date):
    """
    Return the age in whole years at the time of the transaction.

    The age is computed relative to ``transaction_date``, not today's date.
    "Unknown" is returned when either value is missing, when the DOB is the
    1900-01-01 placeholder, when a value has no '-' or '/' date separator or
    cannot be parsed, or when the resulting age is outside 0-120.
    """
    def _parse(text):
        # For dash-separated values only the date part before the first
        # whitespace is parsed; slash-separated values are parsed whole.
        if '-' in text:
            return pd.to_datetime(text.split()[0])
        if '/' in text:
            return pd.to_datetime(text)
        return None

    if pd.isna(dob_value) or dob_value == "":
        return "Unknown"
    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"
    dob_str = str(dob_value).strip()
    # Check for the placeholder date
    if dob_str.startswith(('1900-01-01', '1900/01/01')):
        return "Unknown"
    try:
        dob = _parse(dob_str)
        if dob is None or pd.isna(dob):
            return "Unknown"
        trans_date = _parse(str(transaction_date).strip())
        if trans_date is None or pd.isna(trans_date):
            return "Unknown"
        # Subtract one year when the birthday has not yet occurred in the
        # transaction year (the tuple comparison yields True == 1).
        age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day))
        if age < 0 or age > 120:  # Sanity check
            return "Unknown"
        return age
    except Exception:
        # Unparseable dates are treated as missing rather than fatal.
        return "Unknown"
def calculate_registration_duration(registration_date, transaction_date):
    """
    Return the number of days between registration and transaction.

    Returns:
        int   - days elapsed; 0 when the transaction precedes registration
        "3650+" - when more than 3650 days (about 10 years) elapsed
        "Unknown" - when either value is missing, has no '-' or '/' date
                    separator, or cannot be parsed
    """
    def _parse(text):
        # For dash-separated values only the date part before the first
        # whitespace is parsed; slash-separated values are parsed whole.
        if '-' in text:
            return pd.to_datetime(text.split()[0])
        if '/' in text:
            return pd.to_datetime(text)
        return None

    if pd.isna(registration_date) or registration_date == "":
        return "Unknown"
    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"
    try:
        reg_date = _parse(str(registration_date).strip())
        if reg_date is None or pd.isna(reg_date):
            return "Unknown"
        trans_date = _parse(str(transaction_date).strip())
        if trans_date is None or pd.isna(trans_date):
            return "Unknown"
        days_diff = (trans_date - reg_date).days
        if days_diff < 0:
            # Transaction before registration - treat as 0. Returned as an
            # int so it has the same type as every other numeric result.
            return 0
        if days_diff > 3650:  # Cap at 10 years (sanity check)
            return "3650+"
        return days_diff
    except Exception:
        # Unparseable dates are treated as missing rather than fatal.
        return "Unknown"
def merge_contact_methods(row):
    """
    Collapse the four ContactBy* flags of a row into one string.

    A flag counts as set when its value equals 1 or its string form is
    "true" or "yes" (case-insensitive). The enabled channels are joined
    with commas in the fixed order Email, SMS, Mail, Phone; when none is
    set the result is 'NoContact'.
    """
    channels = (
        ('ContactByEmail', 'Email'),
        ('ContactBySMS', 'SMS'),
        ('ContactByMail', 'Mail'),
        ('ContactByPhone', 'Phone'),
    )
    enabled = []
    for column, label in channels:
        raw = row.get(column, '')
        if raw == 1 or str(raw).lower() in ('true', 'yes'):
            enabled.append(label)
    return ','.join(enabled) if enabled else 'NoContact'
def extract_date_components(date_value, column_name, reference_date=None):
    """
    Split a date into Year, Month, TimeOfMonth and Day (weekday) parts.

    Args:
        date_value: the date to split; it is converted with ``str()`` and
            must contain '-' or '/' to be parsed.
        column_name: prefix for the result keys (``<prefix>_Year`` etc.).
        reference_date: accepted for interface compatibility; not used.

    Returns:
        dict with keys ``<prefix>_Year`` (int), ``<prefix>_Month`` (English
        month name), ``<prefix>_TimeOfMonth`` (third-of-month bucket) and
        ``<prefix>_Day`` (English weekday name). Every value is "Unknown"
        when the date is missing or cannot be parsed.
    """
    parts = ('Year', 'Month', 'TimeOfMonth', 'Day')
    unknown = {f'{column_name}_{part}': "Unknown" for part in parts}

    if pd.isna(date_value) or date_value == "":
        return unknown
    try:
        date_str = str(date_value).strip()
        if '-' in date_str:
            # Keep only the date part before the first whitespace.
            date_obj = pd.to_datetime(date_str.split()[0])
        elif '/' in date_str:
            date_obj = pd.to_datetime(date_str)
        else:
            return unknown
        if pd.isna(date_obj):
            return unknown

        month_names = ['January', 'February', 'March', 'April', 'May', 'June',
                       'July', 'August', 'September', 'October', 'November', 'December']
        day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

        day_num = date_obj.day
        if 1 <= day_num <= 10:
            time_of_month = "Beginning (1-10)"
        elif 11 <= day_num <= 20:
            time_of_month = "Middle (11-20)"
        else:
            time_of_month = "End (21-31)"

        return {
            f'{column_name}_Year': date_obj.year,
            f'{column_name}_Month': month_names[date_obj.month - 1],
            f'{column_name}_TimeOfMonth': time_of_month,
            f'{column_name}_Day': day_names[date_obj.weekday()],
        }
    except Exception:
        # Unparseable dates are treated as missing rather than fatal.
        return unknown
def add_recurring_customer_flag(df, userid_column='Userid'):
    """
    Add an 'IsRecurringCustomer' column (1/0) to ``df`` and return it.

    A row is flagged 1 when its user id occurs on more than one row of
    ``df``; rows with a missing user id are flagged 0. The column is added
    to the passed DataFrame in place, and the same object is returned.
    """
    # Count transactions per user (missing ids are not counted).
    user_transaction_counts = df[userid_column].value_counts()
    # Look up each row's count; ids absent from the counts (NaN) become 0.
    df['IsRecurringCustomer'] = (
        df[userid_column]
        .map(user_transaction_counts)
        .fillna(0)
        .gt(1)
        .astype('int64')
    )
    # Report distinct recurring users, not the number of rows they occupy.
    recurring_users = int((user_transaction_counts > 1).sum())
    print(f" 🔄 Added 'IsRecurringCustomer' flag: {recurring_users} recurring customers out of {df[userid_column].nunique()} unique users")
    return df
def _drop_columns(df, columns, note=""):
    """Drop each listed column that is present, logging one line per drop."""
    for column in columns:
        if column in df.columns:
            df = df.drop(columns=[column])
            print(f" 🗑️ Dropped '{column}'{note}")
    return df


def _report_kept(df, columns):
    """Log one line for each listed column that is present and retained."""
    for column in columns:
        if column in df.columns:
            print(f" ✅ Keeping '{column}'")


def _split_date_column(df, column, prefix):
    """Replace a date column by its four <prefix>_* component columns."""
    date_components = df[column].apply(lambda x: extract_date_components(x, prefix))
    # Reuse df's index: a fresh RangeIndex would misalign the rows in
    # concat whenever df does not have the default 0..n-1 index.
    date_df = pd.DataFrame(date_components.tolist(), index=df.index)
    df = pd.concat([df, date_df], axis=1)
    df = df.drop(columns=[column])
    print(f" ✅ Converted '{column}' into 4 columns ({prefix}_Year, {prefix}_Month, {prefix}_TimeOfMonth, {prefix}_Day)")
    return df


def transform_dataframe(df):
    """
    Apply all column-level transformations to the dataframe.

    Steps run in a fixed order (the letter tags follow the source
    spreadsheet columns): identifier and redundant columns are dropped,
    DOB and RegistrationDate are converted relative to TransactionDate,
    the ContactBy* flags are merged, TransactionDate and PromotionStartDate
    are split into components, and SiteName and Brand are cleaned. Every
    step is skipped when its column is absent and logs one line when it
    acts.

    Returns:
        The transformed DataFrame. New columns are appended in the order
        IsRecurringCustomer, Age, DaysSinceRegistration, ContactMethod,
        Transaction_*, PromotionStart_*.
    """
    print("\n 🔄 Applying transformations...")

    # A > Keep Userid and add recurring customer flag
    if 'Userid' in df.columns:
        print(" ✅ Keeping 'Userid' and adding recurring customer flag")
        df = add_recurring_customer_flag(df, 'Userid')
    else:
        print(" ⚠️ 'Userid' column not found")

    # B-E > Drop columns that hold a single repeated value
    df = _drop_columns(df, ['StoreId', 'Store', 'ParentSiteId', 'SiteType'])
    # F
    _report_kept(df, ['Gender'])

    # G > Convert DOB to Age (using TransactionDate as reference)
    if 'DOB' in df.columns and 'TransactionDate' in df.columns:
        df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1)
        df = df.drop(columns=['DOB'])
        print(" ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)")
    elif 'DOB' in df.columns:
        print(" ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly")

    # H > Convert RegistrationDate to duration (days since registration)
    if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns:
        df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1)
        df = df.drop(columns=['RegistrationDate'])
        print(" ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)")
    elif 'RegistrationDate' in df.columns:
        print(" ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is")

    # I, J
    df = _drop_columns(df, ['FirstLoginDate', 'LastLoginDate'])

    # K, L, M, N > Merge ContactBy columns
    contact_columns = ['ContactByEmail', 'ContactBySMS', 'ContactByMail', 'ContactByPhone']
    existing_contact_cols = [col for col in contact_columns if col in df.columns]
    if existing_contact_cols:
        df['ContactMethod'] = df.apply(merge_contact_methods, axis=1)
        df = df.drop(columns=existing_contact_cols)
        print(f" ✅ Merged {len(existing_contact_cols)} contact columns into 'ContactMethod'")

    # O-R
    df = _drop_columns(df, ['ContactStatus', 'TermsConsent', 'CommunityName', 'CountryId'])
    # S
    _report_kept(df, ['Country'])
    # T
    df = _drop_columns(df, ['StateCode'])
    # U
    _report_kept(df, ['StateName'])
    # V-Y
    df = _drop_columns(df, ['City', 'PostalCode', 'Title', 'Salutation'])
    # Z-AD
    _report_kept(df, ['R', 'F', 'M', 'RFM', 'Tier'])

    # AE > Convert TransactionDate into date components. This must run
    # after steps G and H, which still need the raw TransactionDate.
    if 'TransactionDate' in df.columns:
        df = _split_date_column(df, 'TransactionDate', 'Transaction')

    # AF > Drop CreateDate (as requested - it's the same as TransactionDate)
    df = _drop_columns(df, ['CreateDate'], note=" (duplicate of TransactionDate)")
    # AG, AH
    df = _drop_columns(df, ['MemberId', 'SiteId'])

    # AI > Clean and keep SiteName
    if 'SiteName' in df.columns:
        df['SiteName'] = df['SiteName'].apply(clean_site_name)
        print(" ✅ Kept and cleaned 'SiteName'")

    # AJ, AK
    _report_kept(df, ['Quantity', 'Amount'])
    # AL
    df = _drop_columns(df, ['RewardType'])
    # AM
    _report_kept(df, ['Points'])
    # AN-AP
    df = _drop_columns(df, ['trxDetailId', 'TrxId', 'TransactionStatusId'])
    # AQ
    _report_kept(df, ['TransactionStatusName'])
    # AR
    df = _drop_columns(df, ['TransactionTypeId'])
    # AS
    _report_kept(df, ['TransactionTypeName'])
    # AT
    df = _drop_columns(df, ['Reportable'])
    # AU-AY
    _report_kept(df, ['TransactionItemCode', 'AnalysisCode1', 'AnalysisCode2', 'AnalysisCode3', 'AnalysisCode4'])

    # AZ > Dynamically clean Brand
    if 'Brand' in df.columns:
        print(" 🔍 Analyzing unique brand names to create dynamic mapping...")
        brand_mapping = create_brand_mapping(df['Brand'])
        print(f" 📊 Created mapping for {len(brand_mapping)} unique brand variations")
        df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping))
        print(" ✅ Kept and dynamically cleaned 'Brand'")

    # BA-BF
    _report_kept(df, ['AnalysisCode6', 'AnalysisCode7', 'AnalysisCode8', 'Price', 'AnalysisCode10', 'InvalidReason'])
    # BG, BH
    df = _drop_columns(df, ['Description', 'PromotionId'])
    # BI
    _report_kept(df, ['PromotionName'])

    # BJ > Convert PromotionStartDate into 4 columns
    if 'PromotionStartDate' in df.columns:
        df = _split_date_column(df, 'PromotionStartDate', 'PromotionStart')

    # BK-BP
    df = _drop_columns(df, [
        'PromotionEndDate',
        'PromotionOfferTypeId',
        'PromotionOfferTypeName',
        'PromotionSiteId',
        'PromotionSite',
        'QualifyingProductQuantity',
    ])

    print("\n ✅ All transformations completed!")
    return df
def read_and_process_file(file_path, max_rows=5000):
    """
    Load an Excel workbook, transform it and render the result as CSV.

    The sheet is read with pandas, truncated to the first ``max_rows``
    rows, passed through ``transform_dataframe`` and then every
    object-dtype column is run through ``sanitize_text``.

    Returns:
        (csv_bytes, output_file_name, dataframe) on success, or
        (None, None, None) when the file is missing or any step fails
        (the error is printed, with a traceback for unexpected errors).
    """
    try:
        print(f" 📖 Reading file: {file_path}")
        df = pd.read_excel(file_path)
        print(f" 📊 Original columns: {list(df.columns)}")
        print(f" 📏 Original shape: {df.shape}")

        # Keep only the leading max_rows rows
        original_row_count = len(df)
        if original_row_count <= max_rows:
            print(f" Dataset has {len(df)} rows (within the {max_rows} row limit)")
        else:
            df = df.head(max_rows)
            print(f" ✂️ Limited dataset to first {max_rows} rows (from {original_row_count} total rows)")

        df = transform_dataframe(df)

        # Final pass: make every text column UTF-8 safe
        print("\n 🧹 Final sanitization of text data...")
        text_columns = [col for col in df.columns if df[col].dtype == 'object']
        for col in text_columns:
            df[col] = df[col].apply(sanitize_text)

        # Render to CSV text, then to UTF-8 bytes
        csv_content = df.to_csv(index=False).encode('utf-8')

        # Derive the output name from the input file's base name
        name = os.path.splitext(os.path.basename(file_path))[0]
        modified_file_name = f"{name}_transformed_{len(df)}_rows.csv"

        print(f"\n ✅ Successfully processed file: {modified_file_name}")
        print(f" 📊 Final columns: {list(df.columns)}")
        print(f" 📏 Final shape: {df.shape}")
        print(f" 📄 CSV file size: {len(csv_content)} bytes")
        return csv_content, modified_file_name, df
    except FileNotFoundError:
        print(f"❌ Error: File '{file_path}' not found!")
        return None, None, None
    except Exception as e:
        print(f"❌ Error processing file: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None
def save_clean_dataset(df, file_name):
    """
    Save the transformed dataset as CSV and as an Excel workbook.

    The CSV is written to ``transformed_<file_name>`` (the prefix is
    applied to the base name, so a directory part in ``file_name`` is
    preserved). The Excel copy uses the same path with its extension
    replaced by ``.xlsx``.

    Returns:
        The path of the CSV file that was written.
    """
    directory, base_name = os.path.split(file_name)
    csv_file = os.path.join(directory, f"transformed_{base_name}")
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"\n💾 Transformed dataset saved: {csv_file}")
    # Swap only the final extension. A plain str.replace('.csv', ...) would
    # leave the path unchanged for names without a lowercase '.csv', so the
    # Excel write would target the CSV file itself.
    excel_file = os.path.splitext(csv_file)[0] + '.xlsx'
    df.to_excel(excel_file, index=False)
    print(f"💾 Excel version saved: {excel_file}")
    return csv_file
def main(excel_file_path=None, max_rows=5000):
    """
    Run the complete transformation pipeline.

    Args:
        excel_file_path: path of the Excel workbook to process. When
            omitted, the original hard-coded sample file path is used.
        max_rows: maximum number of leading rows to process.

    Reads and transforms the workbook, saves the CSV/Excel outputs, writes
    a timestamped ``transformation_summary_*.txt`` file in the current
    directory and prints a final report. Returns None; stops early with a
    message when the file cannot be processed.
    """
    if excel_file_path is None:
        # Default input kept for backward compatibility with main().
        excel_file_path = "C:/Users/Mikes/OneDrive/Pictures/MENA_BUSINESS_DATA/Transformation Schiff Sample File for Predictive analysis.xlsx"

    banner = "=" * 80
    print(banner)
    print("🚢 Ship Performance Dataset - Complete Transformation")
    print(banner)

    # Process and transform the file
    print("\n1⃣ Reading and transforming Excel file...")
    file_content, modified_file_name, df = read_and_process_file(excel_file_path, max_rows=max_rows)
    if file_content is None:
        print("\n❌ Process failed. Please check if the file exists.")
        return

    # Save locally
    save_clean_dataset(df, modified_file_name)

    # Save transformation summary
    summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
    summary_lines = [
        "TRANSFORMATION SUMMARY\n",
        "=" * 50 + "\n\n",
        f"Original file: {excel_file_path}\n",
        f"Rows processed: {len(df)}\n",
        f"Final columns: {len(df.columns)}\n\n",
        "Final columns list:\n",
    ]
    summary_lines.extend(f" - {col}\n" for col in df.columns)
    summary_lines.extend([
        "\n" + "=" * 50 + "\n\n",
        "Key transformations applied:\n",
        " - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n",
        " - Converted DOB to Age (using TransactionDate as reference, not today's date)\n",
        " - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n",
        " - Dropped CreateDate (duplicate of TransactionDate)\n",
        " - Dynamically cleaned Brand names using prefix matching\n",
        " - Cleaned SiteName variations\n",
        " - Merged contact method columns into single ContactMethod field\n",
        " - Split date columns into Year, Month, TimeOfMonth, Day components\n",
        " - Removed redundant columns (StoreId, Store, SiteType, etc.)\n",
    ])
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.writelines(summary_lines)
    print(f"\n📄 Transformation summary saved: {summary_file}")

    print("\n" + banner)
    print("🎉 All transformations completed successfully! إن شاء الله")
    print(f"{len(df)} rows processed")
    print(f"{len(df.columns)} columns in final dataset")
    print(" ✅ Recurring customer flag added")
    print(" ✅ DOB converted to Age (using transaction date)")
    print(" ✅ RegistrationDate converted to DaysSinceRegistration")
    print(" ✅ CreateDate dropped (duplicate)")
    print(" ✅ Contact methods merged")
    print(" ✅ Date columns split into components")
    print(" ✅ SiteName and Brand dynamically cleaned")
    print(banner)
# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()