transform_code/transform.py

# بسم الله الرحمن الرحيم

# بسم الله الرحمن الرحيم

# بسم الله الرحمن الرحيم

# بسم الله الرحمن الرحيم

import base64
import json
import pandas as pd
import os
import numpy as np
from datetime import datetime
import io
from difflib import SequenceMatcher

def sanitize_text(value):
    """
    Sanitize text values to ensure they're UTF-8 compatible
    """
    if pd.isna(value):
        return ""

    if isinstance(value, (int, float, np.integer, np.floating)):
        return str(value)

    if isinstance(value, str):
        try:
            return value.encode('utf-8', errors='ignore').decode('utf-8')
        except:
            cleaned = ''.join(char for char in value if ord(char) < 128 or char.isprintable())
            return cleaned

    try:
        return str(value)
    except:
        return ""

def clean_site_name(name):
    """
    Clean SiteName by standardizing similar values
    """
    if pd.isna(name) or name == "":
        return "Unknown"

    name = str(name).strip().lower()

    # Common variations mapping
    site_mapping = {
        'main': 'Main Site',
        'main site': 'Main Site',
        'mainstore': 'Main Site',
        'main store': 'Main Site',
        'north': 'North Site',
        'north site': 'North Site',
        'northstore': 'North Site',
        'south': 'South Site',
        'south site': 'South Site',
        'southstore': 'South Site',
        'east': 'East Site',
        'east site': 'East Site',
        'west': 'West Site',
        'west site': 'West Site',
        'central': 'Central Site',
        'central site': 'Central Site'
    }

    for key, value in site_mapping.items():
        if key in name:
            return value

    return name.title()

def create_brand_mapping(series):
    """
    Dynamically create brand mapping by analyzing unique brand names
    Uses first N characters and similarity matching
    """
    # Get unique brand values (excluding nulls and unknowns)
    unique_brands = series.dropna().unique()
    unique_brands = [str(b).strip() for b in unique_brands if str(b).strip() != "" and str(b).strip().lower() != "unknown"]

    # Dictionary to store mappings
    brand_map = {}

    # First, group by first 3-5 characters
    brand_groups = {}

    for brand in unique_brands:
        brand_lower = brand.lower()

        # Try different prefix lengths
        for prefix_len in [5, 4, 3]:
            if len(brand_lower) >= prefix_len:
                prefix = brand_lower[:prefix_len]
                if prefix not in brand_groups:
                    brand_groups[prefix] = []
                brand_groups[prefix].append(brand)
                break

    # For each group, find the most common/canonical name
    for prefix, brands in brand_groups.items():
        if len(brands) == 1:
            # Single brand - use it as is (capitalized)
            brand_map[brands[0].lower()] = brands[0].title()
        else:
            # Multiple brands with same prefix - find the most frequent or common one
            # Count occurrences in the original series
            brand_counts = series.value_counts()

            # Find the brand with highest count in this group
            best_match = max(brands, key=lambda b: brand_counts.get(b, 0))
            canonical_name = best_match.title()

            # Map all variations to the canonical name
            for brand in brands:
                brand_map[brand.lower()] = canonical_name

    # Also check for brands that are substrings of others
    sorted_brands = sorted(unique_brands, key=len, reverse=True)
    for i, long_brand in enumerate(sorted_brands):
        long_lower = long_brand.lower()
        for short_brand in sorted_brands[i+1:]:
            short_lower = short_brand.lower()
            if short_lower in long_lower and len(short_lower) > 3:
                # Short brand is a substring of long brand
                if short_brand.lower() not in brand_map:
                    brand_map[short_lower] = long_brand.title()

    return brand_map

def clean_brand_dynamic(brand, brand_mapping):
    """
    Clean Brand names using dynamic mapping
    """
    if pd.isna(brand) or brand == "":
        return "Unknown"

    brand_str = str(brand).strip()
    brand_lower = brand_str.lower()

    # Check if we have a mapping for this brand
    if brand_lower in brand_mapping:
        return brand_mapping[brand_lower]

    # Try partial matching using first few characters
    for key, value in brand_mapping.items():
        # Check if brand starts with the same prefix
        if len(brand_lower) >= 3 and len(key) >= 3:
            if brand_lower[:3] == key[:3]:
                return value

    # If not found, return title case
    return brand_str.title()

def calculate_age_from_dob(dob_value, transaction_date):
    """
    Convert DOB to age based on transaction date, not today's date
    Handles 1900-01-01 as Unknown
    """
    if pd.isna(dob_value) or dob_value == "":
        return "Unknown"

    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"

    dob_str = str(dob_value).strip()

    # Check for the placeholder date
    if dob_str.startswith('1900-01-01') or dob_str.startswith('1900/01/01') or dob_str == '1900-01-01':
        return "Unknown"

    try:
        # Parse DOB
        if '-' in dob_str:
            dob = pd.to_datetime(dob_str.split()[0])
        elif '/' in dob_str:
            dob = pd.to_datetime(dob_str)
        else:
            return "Unknown"

        # Parse Transaction Date
        trans_date_str = str(transaction_date).strip()
        if '-' in trans_date_str:
            trans_date = pd.to_datetime(trans_date_str.split()[0])
        elif '/' in trans_date_str:
            trans_date = pd.to_datetime(trans_date_str)
        else:
            return "Unknown"

        # Calculate age at time of transaction
        age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day))

        if age < 0 or age > 120:  # Sanity check
            return "Unknown"

        return age
    except:
        return "Unknown"

def calculate_registration_duration(registration_date, transaction_date):
    """
    Calculate number of days between registration and transaction
    """
    if pd.isna(registration_date) or registration_date == "":
        return "Unknown"

    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"

    try:
        # Parse Registration Date
        reg_str = str(registration_date).strip()
        if '-' in reg_str:
            reg_date = pd.to_datetime(reg_str.split()[0])
        elif '/' in reg_str:
            reg_date = pd.to_datetime(reg_str)
        else:
            return "Unknown"

        # Parse Transaction Date
        trans_str = str(transaction_date).strip()
        if '-' in trans_str:
            trans_date = pd.to_datetime(trans_str.split()[0])
        elif '/' in trans_str:
            trans_date = pd.to_datetime(trans_str)
        else:
            return "Unknown"

        # Calculate days difference
        days_diff = (trans_date - reg_date).days

        if days_diff < 0:
            return "0"  # Transaction before registration - treat as 0

        if days_diff > 3650:  # Cap at 10 years (sanity check)
            return "3650+"

        return days_diff
    except:
        return "Unknown"

def merge_contact_methods(row):
    """
    Merge Email, SMS, Mail, Phone into one column with priority order
    """
    contact_methods = []

    if row.get('ContactByEmail') == 1 or str(row.get('ContactByEmail', '')).lower() == 'true' or str(row.get('ContactByEmail', '')).lower() == 'yes':
        contact_methods.append('Email')
    if row.get('ContactBySMS') == 1 or str(row.get('ContactBySMS', '')).lower() == 'true' or str(row.get('ContactBySMS', '')).lower() == 'yes':
        contact_methods.append('SMS')
    if row.get('ContactByMail') == 1 or str(row.get('ContactByMail', '')).lower() == 'true' or str(row.get('ContactByMail', '')).lower() == 'yes':
        contact_methods.append('Mail')
    if row.get('ContactByPhone') == 1 or str(row.get('ContactByPhone', '')).lower() == 'true' or str(row.get('ContactByPhone', '')).lower() == 'yes':
        contact_methods.append('Phone')

    if not contact_methods:
        return 'NoContact'

    return ','.join(contact_methods)  # Return all methods as comma-separated

def extract_date_components(date_value, column_name, reference_date=None):
    """
    Extract Year, Month, TimeOfMonth, Day from date
    """
    if pd.isna(date_value) or date_value == "":
        return {
            f'{column_name}_Year': "Unknown",
            f'{column_name}_Month': "Unknown",
            f'{column_name}_TimeOfMonth': "Unknown",
            f'{column_name}_Day': "Unknown"
        }

    try:
        # Parse the date
        date_str = str(date_value).strip()
        if '-' in date_str:
            date_obj = pd.to_datetime(date_str.split()[0])
        elif '/' in date_str:
            date_obj = pd.to_datetime(date_str)
        else:
            return {
                f'{column_name}_Year': "Unknown",
                f'{column_name}_Month': "Unknown",
                f'{column_name}_TimeOfMonth': "Unknown",
                f'{column_name}_Day': "Unknown"
            }

        # Extract components
        year = date_obj.year

        month_names = ['January', 'February', 'March', 'April', 'May', 'June',
                      'July', 'August', 'September', 'October', 'November', 'December']
        month = month_names[date_obj.month - 1]

        day_num = date_obj.day
        if 1 <= day_num <= 10:
            time_of_month = "Beginning (1-10)"
        elif 11 <= day_num <= 20:
            time_of_month = "Middle (11-20)"
        else:
            time_of_month = "End (21-31)"

        day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        day = day_names[date_obj.weekday()]

        return {
            f'{column_name}_Year': year,
            f'{column_name}_Month': month,
            f'{column_name}_TimeOfMonth': time_of_month,
            f'{column_name}_Day': day
        }
    except:
        return {
            f'{column_name}_Year': "Unknown",
            f'{column_name}_Month': "Unknown",
            f'{column_name}_TimeOfMonth': "Unknown",
            f'{column_name}_Day': "Unknown"
        }

def add_recurring_customer_flag(df, userid_column='Userid'):
    """
    Add a flag indicating if customer is recurring (has multiple transactions)
    """
    # Count transactions per user
    user_transaction_counts = df[userid_column].value_counts()

    # Create flag: 1 if more than 1 transaction, 0 otherwise
    df['IsRecurringCustomer'] = df[userid_column].map(
        lambda x: 1 if user_transaction_counts.get(x, 0) > 1 else 0
    )

    print(f"   🔄 Added 'IsRecurringCustomer' flag: {df['IsRecurringCustomer'].sum()} recurring customers out of {df[userid_column].nunique()} unique users")

    return df

def transform_dataframe(df):
    """
    Apply all transformations to the dataframe
    """
    print("\n   🔄 Applying transformations...")

    # A > Keep Userid and add recurring customer flag
    if 'Userid' in df.columns:
        print("   ✅ Keeping 'Userid' and adding recurring customer flag")
        df = add_recurring_customer_flag(df, 'Userid')
    else:
        print("   ⚠️ 'Userid' column not found")

    # B > Drop StoreId (same value)
    if 'StoreId' in df.columns:
        df = df.drop(columns=['StoreId'])
        print("   🗑️ Dropped 'StoreId'")

    # C > Drop Store (same value)
    if 'Store' in df.columns:
        df = df.drop(columns=['Store'])
        print("   🗑️ Dropped 'Store'")

    # D > Drop ParentSiteId (same value)
    if 'ParentSiteId' in df.columns:
        df = df.drop(columns=['ParentSiteId'])
        print("   🗑️ Dropped 'ParentSiteId'")

    # E > Drop SiteType (same value)
    if 'SiteType' in df.columns:
        df = df.drop(columns=['SiteType'])
        print("   🗑️ Dropped 'SiteType'")

    # F > Keep Gender
    if 'Gender' in df.columns:
        print("   ✅ Keeping 'Gender'")

    # G > Convert DOB to Age (using TransactionDate as reference)
    if 'DOB' in df.columns and 'TransactionDate' in df.columns:
        df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1)
        df = df.drop(columns=['DOB'])
        print("   ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)")
    elif 'DOB' in df.columns:
        print("   ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly")

    # H > Convert RegistrationDate to duration (days since registration)
    if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns:
        df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1)
        df = df.drop(columns=['RegistrationDate'])
        print("   ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)")
    elif 'RegistrationDate' in df.columns:
        print("   ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is")

    # I > Drop FirstLoginDate
    if 'FirstLoginDate' in df.columns:
        df = df.drop(columns=['FirstLoginDate'])
        print("   🗑️ Dropped 'FirstLoginDate'")

    # J > Drop LastLoginDate
    if 'LastLoginDate' in df.columns:
        df = df.drop(columns=['LastLoginDate'])
        print("   🗑️ Dropped 'LastLoginDate'")

    # K,L,M,N > Merge ContactBy columns
    contact_columns = ['ContactByEmail', 'ContactBySMS', 'ContactByMail', 'ContactByPhone']
    existing_contact_cols = [col for col in contact_columns if col in df.columns]
    if existing_contact_cols:
        df['ContactMethod'] = df.apply(merge_contact_methods, axis=1)
        df = df.drop(columns=existing_contact_cols)
        print(f"   ✅ Merged {len(existing_contact_cols)} contact columns into 'ContactMethod'")

    # O > Drop ContactStatus
    if 'ContactStatus' in df.columns:
        df = df.drop(columns=['ContactStatus'])
        print("   🗑️ Dropped 'ContactStatus'")

    # P > Drop TermsConsent
    if 'TermsConsent' in df.columns:
        df = df.drop(columns=['TermsConsent'])
        print("   🗑️ Dropped 'TermsConsent'")

    # Q > Drop CommunityName
    if 'CommunityName' in df.columns:
        df = df.drop(columns=['CommunityName'])
        print("   🗑️ Dropped 'CommunityName'")

    # R > Drop CountryId
    if 'CountryId' in df.columns:
        df = df.drop(columns=['CountryId'])
        print("   🗑️ Dropped 'CountryId'")

    # S > Keep Country
    if 'Country' in df.columns:
        print("   ✅ Keeping 'Country'")

    # T > Drop StateCode
    if 'StateCode' in df.columns:
        df = df.drop(columns=['StateCode'])
        print("   🗑️ Dropped 'StateCode'")

    # U > Keep StateName
    if 'StateName' in df.columns:
        print("   ✅ Keeping 'StateName'")

    # V > Drop City
    if 'City' in df.columns:
        df = df.drop(columns=['City'])
        print("   🗑️ Dropped 'City'")

    # W > Drop PostalCode
    if 'PostalCode' in df.columns:
        df = df.drop(columns=['PostalCode'])
        print("   🗑️ Dropped 'PostalCode'")

    # X > Drop Title
    if 'Title' in df.columns:
        df = df.drop(columns=['Title'])
        print("   🗑️ Dropped 'Title'")

    # Y > Drop Salutation
    if 'Salutation' in df.columns:
        df = df.drop(columns=['Salutation'])
        print("   🗑️ Dropped 'Salutation'")

    # Z > Keep R
    if 'R' in df.columns:
        print("   ✅ Keeping 'R'")

    # AA > Keep F
    if 'F' in df.columns:
        print("   ✅ Keeping 'F'")

    # AB > Keep M
    if 'M' in df.columns:
        print("   ✅ Keeping 'M'")

    # AC > Keep RFM
    if 'RFM' in df.columns:
        print("   ✅ Keeping 'RFM'")

    # AD > Keep Tier
    if 'Tier' in df.columns:
        print("   ✅ Keeping 'Tier'")

    # AE > Convert TransactionDate into date components
    if 'TransactionDate' in df.columns:
        date_components = df['TransactionDate'].apply(lambda x: extract_date_components(x, 'Transaction'))
        date_df = pd.DataFrame(date_components.tolist())
        df = pd.concat([df, date_df], axis=1)
        df = df.drop(columns=['TransactionDate'])
        print("   ✅ Converted 'TransactionDate' into 4 columns (Transaction_Year, Transaction_Month, Transaction_TimeOfMonth, Transaction_Day)")

    # AF > Drop CreateDate (as requested - it's the same as TransactionDate)
    if 'CreateDate' in df.columns:
        df = df.drop(columns=['CreateDate'])
        print("   🗑️ Dropped 'CreateDate' (duplicate of TransactionDate)")

    # AG > Drop MemberId
    if 'MemberId' in df.columns:
        df = df.drop(columns=['MemberId'])
        print("   🗑️ Dropped 'MemberId'")

    # AH > Drop SiteId
    if 'SiteId' in df.columns:
        df = df.drop(columns=['SiteId'])
        print("   🗑️ Dropped 'SiteId'")

    # AI > Clean and keep SiteName
    if 'SiteName' in df.columns:
        df['SiteName'] = df['SiteName'].apply(clean_site_name)
        print("   ✅ Kept and cleaned 'SiteName'")

    # AJ > Keep Quantity
    if 'Quantity' in df.columns:
        print("   ✅ Keeping 'Quantity'")

    # AK > Keep Amount
    if 'Amount' in df.columns:
        print("   ✅ Keeping 'Amount'")

    # AL > Drop RewardType
    if 'RewardType' in df.columns:
        df = df.drop(columns=['RewardType'])
        print("   🗑️ Dropped 'RewardType'")

    # AM > Keep Points
    if 'Points' in df.columns:
        print("   ✅ Keeping 'Points'")

    # AN > Drop trxDetailId
    if 'trxDetailId' in df.columns:
        df = df.drop(columns=['trxDetailId'])
        print("   🗑️ Dropped 'trxDetailId'")

    # AO > Drop TrxId
    if 'TrxId' in df.columns:
        df = df.drop(columns=['TrxId'])
        print("   🗑️ Dropped 'TrxId'")

    # AP > Drop TransactionStatusId
    if 'TransactionStatusId' in df.columns:
        df = df.drop(columns=['TransactionStatusId'])
        print("   🗑️ Dropped 'TransactionStatusId'")

    # AQ > Keep TransactionStatusName
    if 'TransactionStatusName' in df.columns:
        print("   ✅ Keeping 'TransactionStatusName'")

    # AR > Drop TransactionTypeId
    if 'TransactionTypeId' in df.columns:
        df = df.drop(columns=['TransactionTypeId'])
        print("   🗑️ Dropped 'TransactionTypeId'")

    # AS > Keep TransactionTypeName
    if 'TransactionTypeName' in df.columns:
        print("   ✅ Keeping 'TransactionTypeName'")

    # AT > Drop Reportable
    if 'Reportable' in df.columns:
        df = df.drop(columns=['Reportable'])
        print("   🗑️ Dropped 'Reportable'")

    # AU > Keep TransactionItemCode
    if 'TransactionItemCode' in df.columns:
        print("   ✅ Keeping 'TransactionItemCode'")

    # AV > Keep AnalysisCode1
    if 'AnalysisCode1' in df.columns:
        print("   ✅ Keeping 'AnalysisCode1'")

    # AW > Keep AnalysisCode2
    if 'AnalysisCode2' in df.columns:
        print("   ✅ Keeping 'AnalysisCode2'")

    # AX > Keep AnalysisCode3
    if 'AnalysisCode3' in df.columns:
        print("   ✅ Keeping 'AnalysisCode3'")

    # AY > Keep AnalysisCode4
    if 'AnalysisCode4' in df.columns:
        print("   ✅ Keeping 'AnalysisCode4'")

    # AZ > Dynamically clean Brand
    if 'Brand' in df.columns:
        print("   🔍 Analyzing unique brand names to create dynamic mapping...")
        brand_mapping = create_brand_mapping(df['Brand'])
        print(f"   📊 Created mapping for {len(brand_mapping)} unique brand variations")
        df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping))
        print("   ✅ Kept and dynamically cleaned 'Brand'")

    # BA > Keep AnalysisCode6
    if 'AnalysisCode6' in df.columns:
        print("   ✅ Keeping 'AnalysisCode6'")

    # BB > Keep AnalysisCode7
    if 'AnalysisCode7' in df.columns:
        print("   ✅ Keeping 'AnalysisCode7'")

    # BC > Keep AnalysisCode8
    if 'AnalysisCode8' in df.columns:
        print("   ✅ Keeping 'AnalysisCode8'")

    # BD > Keep Price
    if 'Price' in df.columns:
        print("   ✅ Keeping 'Price'")

    # BE > Keep AnalysisCode10
    if 'AnalysisCode10' in df.columns:
        print("   ✅ Keeping 'AnalysisCode10'")

    # BF > Keep InvalidReason
    if 'InvalidReason' in df.columns:
        print("   ✅ Keeping 'InvalidReason'")

    # BG > Drop Description
    if 'Description' in df.columns:
        df = df.drop(columns=['Description'])
        print("   🗑️ Dropped 'Description'")

    # BH > Drop PromotionId
    if 'PromotionId' in df.columns:
        df = df.drop(columns=['PromotionId'])
        print("   🗑️ Dropped 'PromotionId'")

    # BI > Keep PromotionName
    if 'PromotionName' in df.columns:
        print("   ✅ Keeping 'PromotionName'")

    # BJ > Convert PromotionStartDate into 4 columns
    if 'PromotionStartDate' in df.columns:
        date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart'))
        date_df = pd.DataFrame(date_components.tolist())
        df = pd.concat([df, date_df], axis=1)
        df = df.drop(columns=['PromotionStartDate'])
        print("   ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)")

    # BK > Drop PromotionEndDate
    if 'PromotionEndDate' in df.columns:
        df = df.drop(columns=['PromotionEndDate'])
        print("   🗑️ Dropped 'PromotionEndDate'")

    # BL > Drop PromotionOfferTypeId
    if 'PromotionOfferTypeId' in df.columns:
        df = df.drop(columns=['PromotionOfferTypeId'])
        print("   🗑️ Dropped 'PromotionOfferTypeId'")

    # BM > Drop PromotionOfferTypeName
    if 'PromotionOfferTypeName' in df.columns:
        df = df.drop(columns=['PromotionOfferTypeName'])
        print("   🗑️ Dropped 'PromotionOfferTypeName'")

    # BN > Drop PromotionSiteId
    if 'PromotionSiteId' in df.columns:
        df = df.drop(columns=['PromotionSiteId'])
        print("   🗑️ Dropped 'PromotionSiteId'")

    # BO > Drop PromotionSite
    if 'PromotionSite' in df.columns:
        df = df.drop(columns=['PromotionSite'])
        print("   🗑️ Dropped 'PromotionSite'")

    # BP > Drop QualifyingProductQuantity
    if 'QualifyingProductQuantity' in df.columns:
        df = df.drop(columns=['QualifyingProductQuantity'])
        print("   🗑️ Dropped 'QualifyingProductQuantity'")

    print("\n   ✅ All transformations completed!")
    return df

def read_and_process_file(file_path, max_rows=5000):
    """
    Read the Excel file and apply all transformations
    """
    try:
        print(f"   📖 Reading file: {file_path}")

        # Read the Excel file
        df = pd.read_excel(file_path)

        print(f"   📊 Original columns: {list(df.columns)}")
        print(f"   📏 Original shape: {df.shape}")

        # Limit to first max_rows
        original_row_count = len(df)
        if len(df) > max_rows:
            df = df.head(max_rows)
            print(f"   ✂️ Limited dataset to first {max_rows} rows (from {original_row_count} total rows)")
        else:
            print(f"   ℹ️ Dataset has {len(df)} rows (within the {max_rows} row limit)")

        # Apply all transformations
        df = transform_dataframe(df)

        # Sanitize all text data (final pass)
        print("\n   🧹 Final sanitization of text data...")
        for col in df.columns:
            if df[col].dtype == 'object':  # Only process string columns
                df[col] = df[col].apply(sanitize_text)

        # Convert DataFrame to CSV
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False, encoding='utf-8')
        csv_content = csv_buffer.getvalue().encode('utf-8')

        # Get original file name and create modified name
        original_file_name = os.path.basename(file_path)
        name, ext = os.path.splitext(original_file_name)
        modified_file_name = f"{name}_transformed_{len(df)}_rows.csv"

        print(f"\n   ✅ Successfully processed file: {modified_file_name}")
        print(f"   📊 Final columns: {list(df.columns)}")
        print(f"   📏 Final shape: {df.shape}")
        print(f"   📄 CSV file size: {len(csv_content)} bytes")

        return csv_content, modified_file_name, df

    except FileNotFoundError:
        print(f"❌ Error: File '{file_path}' not found!")
        return None, None, None
    except Exception as e:
        print(f"❌ Error processing file: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None

def save_clean_dataset(df, file_name):
    """
    Save the transformed dataset locally
    """
    csv_file = f"transformed_{file_name}"
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"\n💾 Transformed dataset saved: {csv_file}")

    excel_file = csv_file.replace('.csv', '.xlsx')
    df.to_excel(excel_file, index=False)
    print(f"💾 Excel version saved: {excel_file}")

    return csv_file

def main():
    """
    Main function to execute all transformations
    """
    print("=" * 80)
    print("🚢 Ship Performance Dataset - Complete Transformation")
    print("=" * 80)

    # Specify the path to your Excel file
    excel_file_path = "C:/Users/Mikes/OneDrive/Pictures/MENA_BUSINESS_DATA/Transformation Schiff Sample File for Predictive analysis.xlsx"

    # Process and transform the file
    print("\n1️⃣ Reading and transforming Excel file...")
    file_content, modified_file_name, df = read_and_process_file(excel_file_path, max_rows=5000)

    if file_content is None:
        print("\n❌ Process failed. Please check if the file exists.")
        return

    # Save locally
    save_clean_dataset(df, modified_file_name)

    # Save transformation summary
    summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("TRANSFORMATION SUMMARY\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Original file: {excel_file_path}\n")
        f.write(f"Rows processed: {len(df)}\n")
        f.write(f"Final columns: {len(df.columns)}\n\n")
        f.write("Final columns list:\n")
        for col in df.columns:
            f.write(f"  - {col}\n")
        f.write("\n" + "=" * 50 + "\n\n")
        f.write("Key transformations applied:\n")
        f.write("  - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n")
        f.write("  - Converted DOB to Age (using TransactionDate as reference, not today's date)\n")
        f.write("  - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n")
        f.write("  - Dropped CreateDate (duplicate of TransactionDate)\n")
        f.write("  - Dynamically cleaned Brand names using prefix matching\n")
        f.write("  - Cleaned SiteName variations\n")
        f.write("  - Merged contact method columns into single ContactMethod field\n")
        f.write("  - Split date columns into Year, Month, TimeOfMonth, Day components\n")
        f.write("  - Removed redundant columns (StoreId, Store, SiteType, etc.)\n")

    print(f"\n📄 Transformation summary saved: {summary_file}")

    print("\n" + "=" * 80)
    print("🎉 All transformations completed successfully! إن شاء الله")
    print(f"   ✅ {len(df)} rows processed")
    print(f"   ✅ {len(df.columns)} columns in final dataset")
    print("   ✅ Recurring customer flag added")
    print("   ✅ DOB converted to Age (using transaction date)")
    print("   ✅ RegistrationDate converted to DaysSinceRegistration")
    print("   ✅ CreateDate dropped (duplicate)")
    print("   ✅ Contact methods merged")
    print("   ✅ Date columns split into components")
    print("   ✅ SiteName and Brand dynamically cleaned")
    print("=" * 80)

if __name__ == "__main__":
    main()