diff --git a/transform.py b/transform.py index f62cf66..9dd5332 100644 --- a/transform.py +++ b/transform.py @@ -4,15 +4,16 @@ # بسم الله الرحمن الرحيم +# بسم الله الرحمن الرحيم + import base64 import json -import requests -import os import pandas as pd +import os +import numpy as np from datetime import datetime import io -import numpy as np -import re +from difflib import SequenceMatcher def sanitize_text(value): """ @@ -71,47 +72,99 @@ def clean_site_name(name): return name.title() -def clean_brand(brand): +def create_brand_mapping(series): """ - Clean Brand names by standardizing similar values + Dynamically create brand mapping by analyzing unique brand names + Uses first N characters and similarity matching + """ + # Get unique brand values (excluding nulls and unknowns) + unique_brands = series.dropna().unique() + unique_brands = [str(b).strip() for b in unique_brands if str(b).strip() != "" and str(b).strip().lower() != "unknown"] + + # Dictionary to store mappings + brand_map = {} + + # First, group by first 3-5 characters + brand_groups = {} + + for brand in unique_brands: + brand_lower = brand.lower() + + # Try different prefix lengths + for prefix_len in [5, 4, 3]: + if len(brand_lower) >= prefix_len: + prefix = brand_lower[:prefix_len] + if prefix not in brand_groups: + brand_groups[prefix] = [] + brand_groups[prefix].append(brand) + break + + # For each group, find the most common/canonical name + for prefix, brands in brand_groups.items(): + if len(brands) == 1: + # Single brand - use it as is (capitalized) + brand_map[brands[0].lower()] = brands[0].title() + else: + # Multiple brands with same prefix - find the most frequent or common one + # Count occurrences in the original series + brand_counts = series.value_counts() + + # Find the brand with highest count in this group + best_match = max(brands, key=lambda b: brand_counts.get(b, 0)) + canonical_name = best_match.title() + + # Map all variations to the canonical name + for brand in brands: + brand_map[brand.lower()] = canonical_name + + # Also check for brands that are substrings of others + sorted_brands = sorted(unique_brands, key=len, reverse=True) + for i, long_brand in enumerate(sorted_brands): + long_lower = long_brand.lower() + for short_brand in sorted_brands[i+1:]: + short_lower = short_brand.lower() + if short_lower in long_lower and len(short_lower) > 3: + # Short brand is a substring of long brand + if short_brand.lower() not in brand_map: + brand_map[short_lower] = long_brand.title() + + return brand_map + +def clean_brand_dynamic(brand, brand_mapping): + """ + Clean Brand names using dynamic mapping """ if pd.isna(brand) or brand == "": return "Unknown" - brand = str(brand).strip().lower() + brand_str = str(brand).strip() + brand_lower = brand_str.lower() - # Brand variations mapping - brand_mapping = { - 'nike': 'Nike', - 'nik e': 'Nike', - 'ni ke': 'Nike', - 'adidas': 'Adidas', - 'addidas': 'Adidas', - 'adidas ': 'Adidas', - 'puma': 'Puma', - 'pum a': 'Puma', - 'reebok': 'Reebok', - 'reebok ': 'Reebok', - 'reeb ok': 'Reebok', - 'gucci': 'Gucci', - 'gucc i': 'Gucci', - 'chanel': 'Chanel', - 'chan el': 'Chanel' - } + # Check if we have a mapping for this brand + if brand_lower in brand_mapping: + return brand_mapping[brand_lower] + # Try partial matching using first few characters for key, value in brand_mapping.items(): - if key in brand: - return value + # Check if brand starts with the same prefix + if len(brand_lower) >= 3 and len(key) >= 3: + if brand_lower[:3] == key[:3]: + return value - return brand.title() + # If not found, return title case + return brand_str.title() -def calculate_age_from_dob(dob_value): +def calculate_age_from_dob(dob_value, transaction_date): """ - Convert DOB to age, handle 1900-01-01 as Unknown + Convert DOB to age based on transaction date, not today's date + Handles 1900-01-01 as Unknown """ if pd.isna(dob_value) or dob_value == "": return "Unknown" + if pd.isna(transaction_date) or transaction_date == "": + return "Unknown" + dob_str = str(dob_value).strip() # Check for the placeholder date @@ -119,16 +172,25 @@ def calculate_age_from_dob(dob_value): return "Unknown" try: - # Try to parse the date + # Parse DOB if '-' in dob_str: - dob = pd.to_datetime(dob_str.split()[0]) # Handle datetime strings + dob = pd.to_datetime(dob_str.split()[0]) elif '/' in dob_str: dob = pd.to_datetime(dob_str) else: return "Unknown" - today = datetime.now() - age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day)) + # Parse Transaction Date + trans_date_str = str(transaction_date).strip() + if '-' in trans_date_str: + trans_date = pd.to_datetime(trans_date_str.split()[0]) + elif '/' in trans_date_str: + trans_date = pd.to_datetime(trans_date_str) + else: + return "Unknown" + + # Calculate age at time of transaction + age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day)) if age < 0 or age > 120: # Sanity check return "Unknown" @@ -137,6 +199,48 @@ def calculate_age_from_dob(dob_value): except: return "Unknown" +def calculate_registration_duration(registration_date, transaction_date): + """ + Calculate number of days between registration and transaction + """ + if pd.isna(registration_date) or registration_date == "": + return "Unknown" + + if pd.isna(transaction_date) or transaction_date == "": + return "Unknown" + + try: + # Parse Registration Date + reg_str = str(registration_date).strip() + if '-' in reg_str: + reg_date = pd.to_datetime(reg_str.split()[0]) + elif '/' in reg_str: + reg_date = pd.to_datetime(reg_str) + else: + return "Unknown" + + # Parse Transaction Date + trans_str = str(transaction_date).strip() + if '-' in trans_str: + trans_date = pd.to_datetime(trans_str.split()[0]) + elif '/' in trans_str: + trans_date = pd.to_datetime(trans_str) + else: + return "Unknown" + + # Calculate days difference + days_diff = (trans_date - reg_date).days + + if days_diff < 0: + return "0" # Transaction before registration - treat as 0 + + if days_diff > 3650: # Cap at 10 years (sanity check) + return "3650+" + + return days_diff + except: + return "Unknown" + def merge_contact_methods(row): """ Merge Email, SMS, Mail, Phone into one column with priority order @@ -157,7 +261,7 @@ def merge_contact_methods(row): return ','.join(contact_methods) # Return all methods as comma-separated -def extract_date_components(date_value, column_name): +def extract_date_components(date_value, column_name, reference_date=None): """ Extract Year, Month, TimeOfMonth, Day from date """ @@ -269,15 +373,21 @@ def transform_dataframe(df): if 'Gender' in df.columns: print(" ✅ Keeping 'Gender'") - # G > Convert DOB to Age - if 'DOB' in df.columns: - df['Age'] = df['DOB'].apply(calculate_age_from_dob) + # G > Convert DOB to Age (using TransactionDate as reference) + if 'DOB' in df.columns and 'TransactionDate' in df.columns: + df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1) df = df.drop(columns=['DOB']) - print(" ✅ Converted 'DOB' to 'Age' (1900-01-01 → Unknown)") + print(" ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)") + elif 'DOB' in df.columns: + print(" ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly") - # H > Keep RegistrationDate - if 'RegistrationDate' in df.columns: - print(" ✅ Keeping 'RegistrationDate'") + # H > Convert RegistrationDate to duration (days since registration) + if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns: + df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1) + df = df.drop(columns=['RegistrationDate']) + print(" ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)") + elif 'RegistrationDate' in df.columns: + print(" ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is") # I > Drop FirstLoginDate if 'FirstLoginDate' in df.columns: @@ -370,19 +480,18 @@ def transform_dataframe(df): if 'Tier' in df.columns: print(" ✅ Keeping 'Tier'") - # AE, AF > Merge TransactionDate and CreateDate into date components - date_columns_to_process = [] + # AE > Convert TransactionDate into date components if 'TransactionDate' in df.columns: - date_columns_to_process.append(('TransactionDate', 'Transaction')) - if 'CreateDate' in df.columns: - date_columns_to_process.append(('CreateDate', 'Create')) - - for date_col, prefix in date_columns_to_process: - date_components = df[date_col].apply(lambda x: extract_date_components(x, prefix)) + date_components = df['TransactionDate'].apply(lambda x: extract_date_components(x, 'Transaction')) date_df = pd.DataFrame(date_components.tolist()) df = pd.concat([df, date_df], axis=1) - df = df.drop(columns=[date_col]) - print(f" ✅ Converted '{date_col}' into 4 columns ({prefix}_Year, {prefix}_Month, {prefix}_TimeOfMonth, {prefix}_Day)") + df = df.drop(columns=['TransactionDate']) + print(" ✅ Converted 'TransactionDate' into 4 columns (Transaction_Year, Transaction_Month, Transaction_TimeOfMonth, Transaction_Day)") + + # AF > Drop CreateDate (as requested - it's the same as TransactionDate) + if 'CreateDate' in df.columns: + df = df.drop(columns=['CreateDate']) + print(" 🗑️ Dropped 'CreateDate' (duplicate of TransactionDate)") # AG > Drop MemberId if 'MemberId' in df.columns: @@ -394,135 +503,128 @@ def transform_dataframe(df): df = df.drop(columns=['SiteId']) print(" 🗑️ Dropped 'SiteId'") - # AI > Drop ParentSiteId - if 'ParentSiteId' in df.columns: - df = df.drop(columns=['ParentSiteId']) - print(" 🗑️ Dropped 'ParentSiteId'") - - # AJ > Keep and clean SiteName + # AI > Clean and keep SiteName if 'SiteName' in df.columns: df['SiteName'] = df['SiteName'].apply(clean_site_name) print(" ✅ Kept and cleaned 'SiteName'") - # AK > Drop SiteType - if 'SiteType' in df.columns: - df = df.drop(columns=['SiteType']) - print(" 🗑️ Dropped 'SiteType'") - - # AL > Keep Quantity + # AJ > Keep Quantity if 'Quantity' in df.columns: print(" ✅ Keeping 'Quantity'") - # AM > Keep Amount + # AK > Keep Amount if 'Amount' in df.columns: print(" ✅ Keeping 'Amount'") - # AN > Drop RewardType + # AL > Drop RewardType if 'RewardType' in df.columns: df = df.drop(columns=['RewardType']) print(" 🗑️ Dropped 'RewardType'") - # AO > Keep Points + # AM > Keep Points if 'Points' in df.columns: print(" ✅ Keeping 'Points'") - # AP > Drop trxDetailId + # AN > Drop trxDetailId if 'trxDetailId' in df.columns: df = df.drop(columns=['trxDetailId']) print(" 🗑️ Dropped 'trxDetailId'") - # AQ > Drop TrxId + # AO > Drop TrxId if 'TrxId' in df.columns: df = df.drop(columns=['TrxId']) print(" 🗑️ Dropped 'TrxId'") - # AR > Drop TransactionStatusId + # AP > Drop TransactionStatusId if 'TransactionStatusId' in df.columns: df = df.drop(columns=['TransactionStatusId']) print(" 🗑️ Dropped 'TransactionStatusId'") - # AS > Keep TransactionStatusName + # AQ > Keep TransactionStatusName if 'TransactionStatusName' in df.columns: print(" ✅ Keeping 'TransactionStatusName'") - # AT > Drop TransactionTypeId + # AR > Drop TransactionTypeId if 'TransactionTypeId' in df.columns: df = df.drop(columns=['TransactionTypeId']) print(" 🗑️ Dropped 'TransactionTypeId'") - # AU > Keep TransactionTypeName + # AS > Keep TransactionTypeName if 'TransactionTypeName' in df.columns: print(" ✅ Keeping 'TransactionTypeName'") - # AV > Drop Reportable + # AT > Drop Reportable if 'Reportable' in df.columns: df = df.drop(columns=['Reportable']) print(" 🗑️ Dropped 'Reportable'") - # AW > Keep TransactionItemCode + # AU > Keep TransactionItemCode if 'TransactionItemCode' in df.columns: print(" ✅ Keeping 'TransactionItemCode'") - # AX > Keep AnalysisCode1 + # AV > Keep AnalysisCode1 if 'AnalysisCode1' in df.columns: print(" ✅ Keeping 'AnalysisCode1'") - # AY > Keep AnalysisCode2 + # AW > Keep AnalysisCode2 if 'AnalysisCode2' in df.columns: print(" ✅ Keeping 'AnalysisCode2'") - # AZ > Keep AnalysisCode3 + # AX > Keep AnalysisCode3 if 'AnalysisCode3' in df.columns: print(" ✅ Keeping 'AnalysisCode3'") - # BA > Keep AnalysisCode4 + # AY > Keep AnalysisCode4 if 'AnalysisCode4' in df.columns: print(" ✅ Keeping 'AnalysisCode4'") - # BB > Keep and clean Brand + # AZ > Dynamically clean Brand if 'Brand' in df.columns: - df['Brand'] = df['Brand'].apply(clean_brand) - print(" ✅ Kept and cleaned 'Brand'") + print(" 🔍 Analyzing unique brand names to create dynamic mapping...") + brand_mapping = create_brand_mapping(df['Brand']) + print(f" 📊 Created mapping for {len(brand_mapping)} unique brand variations") + df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping)) + print(" ✅ Kept and dynamically cleaned 'Brand'") - # BC > Keep AnalysisCode6 + # BA > Keep AnalysisCode6 if 'AnalysisCode6' in df.columns: print(" ✅ Keeping 'AnalysisCode6'") - # BD > Keep AnalysisCode7 + # BB > Keep AnalysisCode7 if 'AnalysisCode7' in df.columns: print(" ✅ Keeping 'AnalysisCode7'") - # BE > Keep AnalysisCode8 + # BC > Keep AnalysisCode8 if 'AnalysisCode8' in df.columns: print(" ✅ Keeping 'AnalysisCode8'") - # BF > Keep Price + # BD > Keep Price if 'Price' in df.columns: print(" ✅ Keeping 'Price'") - # BG > Keep AnalysisCode10 + # BE > Keep AnalysisCode10 if 'AnalysisCode10' in df.columns: print(" ✅ Keeping 'AnalysisCode10'") - # BH > Keep InvalidReason + # BF > Keep InvalidReason if 'InvalidReason' in df.columns: print(" ✅ Keeping 'InvalidReason'") - # BI > Drop Description + # BG > Drop Description if 'Description' in df.columns: df = df.drop(columns=['Description']) print(" 🗑️ Dropped 'Description'") - # BJ > Drop PromotionId + # BH > Drop PromotionId if 'PromotionId' in df.columns: df = df.drop(columns=['PromotionId']) print(" 🗑️ Dropped 'PromotionId'") - # BK > Keep PromotionName + # BI > Keep PromotionName if 'PromotionName' in df.columns: print(" ✅ Keeping 'PromotionName'") - # BL > Convert PromotionStartDate into 4 columns + # BJ > Convert PromotionStartDate into 4 columns if 'PromotionStartDate' in df.columns: date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart')) date_df = pd.DataFrame(date_components.tolist()) @@ -530,32 +632,32 @@ def transform_dataframe(df): df = df.drop(columns=['PromotionStartDate']) print(" ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)") - # BM > Drop PromotionEndDate + # BK > Drop PromotionEndDate if 'PromotionEndDate' in df.columns: df = df.drop(columns=['PromotionEndDate']) print(" 🗑️ Dropped 'PromotionEndDate'") - # BN > Drop PromotionOfferTypeId + # BL > Drop PromotionOfferTypeId if 'PromotionOfferTypeId' in df.columns: df = df.drop(columns=['PromotionOfferTypeId']) print(" 🗑️ Dropped 'PromotionOfferTypeId'") - # BO > Drop PromotionOfferTypeName + # BM > Drop PromotionOfferTypeName if 'PromotionOfferTypeName' in df.columns: df = df.drop(columns=['PromotionOfferTypeName']) print(" 🗑️ Dropped 'PromotionOfferTypeName'") - # BP > Drop PromotionSiteId + # BN > Drop PromotionSiteId if 'PromotionSiteId' in df.columns: df = df.drop(columns=['PromotionSiteId']) print(" 🗑️ Dropped 'PromotionSiteId'") - # BQ > Drop PromotionSite + # BO > Drop PromotionSite if 'PromotionSite' in df.columns: df = df.drop(columns=['PromotionSite']) print(" 🗑️ Dropped 'PromotionSite'") - # BR > Drop QualifyingProductQuantity + # BP > Drop QualifyingProductQuantity if 'QualifyingProductQuantity' in df.columns: df = df.drop(columns=['QualifyingProductQuantity']) print(" 🗑️ Dropped 'QualifyingProductQuantity'") @@ -619,64 +721,6 @@ def read_and_process_file(file_path, max_rows=5000): traceback.print_exc() return None, None, None -def encode_file_to_base64(file_content): - """ - Encode file content to base64 string - """ - try: - base64_encoded = base64.b64encode(file_content).decode('ascii') - return base64_encoded - except Exception as e: - print(f"❌ Error encoding to base64: {e}") - cleaned_content = bytes([b for b in file_content if b < 128]) - base64_encoded = base64.b64encode(cleaned_content).decode('ascii') - return base64_encoded - -def send_to_api(file_name, base64_data): - """ - Send the encoded file data to the API - """ - api_url = "https://problab-api-0004c00ee319.hosted.ghaymah.systems/process_dataset" - - payload = { - "event": { - "data": { - "new": { - "id": "snipp_transformed", - "file_data": base64_data, - "file_name": file_name, - "hasHeader": True, - "delimiter": "," - } - } - } - } - - headers = { - 'Content-Type': 'application/json', - 'User-Agent': 'Data-Transformer/1.0', - 'Accept': 'application/json' - } - - try: - print(f"\n🔄 Sending transformed file '{file_name}' to API...") - print(f"📊 Base64 data size: {len(base64_data)} characters") - - response = requests.post(api_url, json=payload, headers=headers, timeout=60) - - if response.status_code == 200: - print("✅ File sent successfully!") - print(f"📋 Response status: {response.status_code}") - else: - print(f"❌ Failed to send file. Status code: {response.status_code}") - print(f"📋 Response: {response.text[:500]}") - - return response - - except Exception as e: - print(f"❌ Error occurred while sending to API: {e}") - return None - def save_clean_dataset(df, file_name): """ Save the transformed dataset locally @@ -693,10 +737,10 @@ def save_clean_dataset(df, file_name): def main(): """ - Main function to execute all transformations and upload + Main function to execute all transformations """ print("=" * 80) - print("🚢 Ship Performance Dataset - Complete Transformation & Upload") + print("🚢 Ship Performance Dataset - Complete Transformation") print("=" * 80) # Specify the path to your Excel file @@ -710,21 +754,12 @@ def main(): print("\n❌ Process failed. Please check if the file exists.") return - # Encode to base64 - print("\n2️⃣ Encoding transformed file to base64...") - base64_data = encode_file_to_base64(file_content) - print(f" ✅ Encoding complete ({len(base64_data)} characters)") - - # Send to API - print("\n3️⃣ Sending transformed data to API...") - response = send_to_api(modified_file_name, base64_data) - # Save locally save_clean_dataset(df, modified_file_name) # Save transformation summary summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt' - with open(summary_file, 'w') as f: + with open(summary_file, 'w', encoding='utf-8') as f: f.write("TRANSFORMATION SUMMARY\n") f.write("=" * 50 + "\n\n") f.write(f"Original file: {excel_file_path}\n") @@ -733,23 +768,34 @@ def main(): f.write("Final columns list:\n") for col in df.columns: f.write(f" - {col}\n") + f.write("\n" + "=" * 50 + "\n\n") + f.write("Key transformations applied:\n") + f.write(" - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n") + f.write(" - Converted DOB to Age (using TransactionDate as reference, not today's date)\n") + f.write(" - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n") + f.write(" - Dropped CreateDate (duplicate of TransactionDate)\n") + f.write(" - Dynamically cleaned Brand names using prefix matching\n") + f.write(" - Cleaned SiteName variations\n") + f.write(" - Merged contact method columns into single ContactMethod field\n") + f.write(" - Split date columns into Year, Month, TimeOfMonth, Day components\n") + f.write(" - Removed redundant columns (StoreId, Store, SiteType, etc.)\n") print(f"\n📄 Transformation summary saved: {summary_file}") print("\n" + "=" * 80) - if response and response.status_code == 200: - print("🎉 All transformations completed and file uploaded successfully! إن شاء الله") - print(f" ✅ {len(df)} rows processed") - print(f" ✅ {len(df.columns)} columns in final dataset") - print(" ✅ Recurring customer flag added") - print(" ✅ DOB converted to Age") - print(" ✅ Contact methods merged") - print(" ✅ Date columns split into components") - print(" ✅ SiteName and Brand cleaned") - else: - print("⚠️ Process completed but API upload may have failed.") - print(" 💡 Transformed file saved locally for inspection.") + print("🎉 All transformations completed successfully! إن شاء الله") + print(f" ✅ {len(df)} rows processed") + print(f" ✅ {len(df.columns)} columns in final dataset") + print(" ✅ Recurring customer flag added") + print(" ✅ DOB converted to Age (using transaction date)") + print(" ✅ RegistrationDate converted to DaysSinceRegistration") + print(" ✅ CreateDate dropped (duplicate)") + print(" ✅ Contact methods merged") + print(" ✅ Date columns split into components") + print(" ✅ SiteName and Brand dynamically cleaned") print("=" * 80) if __name__ == "__main__": - main() \ No newline at end of file + main() + + \ No newline at end of file