Update transform.py
هذا الالتزام موجود في:
408
transform.py
408
transform.py
@@ -4,15 +4,16 @@
|
|||||||
|
|
||||||
# بسم الله الرحمن الرحيم
|
# بسم الله الرحمن الرحيم
|
||||||
|
|
||||||
|
# بسم الله الرحمن الرحيم
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
import requests
|
|
||||||
import os
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import io
|
import io
|
||||||
import numpy as np
|
from difflib import SequenceMatcher
|
||||||
import re
|
|
||||||
|
|
||||||
def sanitize_text(value):
|
def sanitize_text(value):
|
||||||
"""
|
"""
|
||||||
@@ -71,47 +72,99 @@ def clean_site_name(name):
|
|||||||
|
|
||||||
return name.title()
|
return name.title()
|
||||||
|
|
||||||
def clean_brand(brand):
|
def create_brand_mapping(series):
    """
    Build a mapping from lower-cased brand spellings to one canonical name.

    Every non-null value is converted to a string and stripped. Empty strings
    and the placeholder "unknown" (any case) are ignored. The remaining names
    are grouped by their leading characters: the first 5 characters, or the
    whole name when it is only 3 or 4 characters long. Inside each group the
    most frequent name (title-cased) becomes the canonical value for every
    member of the group. Names shorter than 3 characters are not mapped.

    Args:
        series: pandas Series holding the raw brand values.

    Returns:
        dict mapping each lower-cased brand name to its canonical,
        title-cased name.
    """
    # Normalise once so that counting and grouping see the same strings;
    # counting raw values would miss variants such as "Nike " vs "Nike".
    cleaned = [str(value).strip() for value in series.dropna()]
    cleaned = [name for name in cleaned if name and name.lower() != "unknown"]

    # Frequency of each normalised name, computed a single time.
    brand_counts = pd.Series(cleaned, dtype="object").value_counts()

    # Distinct names in order of first appearance (ties below resolve to the
    # earliest-seen name).
    unique_brands = list(dict.fromkeys(cleaned))

    # Group names by prefix: 5 characters when available, otherwise 4 or 3.
    brand_groups = {}
    for brand in unique_brands:
        brand_lower = brand.lower()
        if len(brand_lower) < 3:
            continue
        prefix = brand_lower[:min(len(brand_lower), 5)]
        brand_groups.setdefault(prefix, []).append(brand)

    # Map every member of a group to the group's most frequent spelling.
    brand_map = {}
    for brands in brand_groups.values():
        best_match = max(brands, key=lambda b: brand_counts.get(b, 0))
        canonical_name = best_match.title()
        for brand in brands:
            brand_map[brand.lower()] = canonical_name

    # NOTE: an earlier substring pass (mapping a short name onto a longer name
    # containing it) only considered names longer than 3 characters, all of
    # which are already mapped above, so it could never add an entry.
    return brand_map
|
||||||
|
|
||||||
|
def clean_brand_dynamic(brand, brand_mapping):
    """
    Normalise a single brand value using a pre-built brand mapping.

    Lookup order:
      1. Null, empty or whitespace-only values return "Unknown".
      2. An exact (case-insensitive, stripped) key in ``brand_mapping`` wins.
      3. The literal placeholder "unknown" returns "Unknown".
      4. Otherwise the first mapping key sharing the same first 3 characters
         is used (only for values of at least 3 characters).
      5. If nothing matches, the stripped value is returned in title case.

    Args:
        brand: raw brand value (any scalar).
        brand_mapping: dict of lower-cased brand name -> canonical name.

    Returns:
        str: the cleaned brand name, or "Unknown".
    """
    if pd.isna(brand) or brand == "":
        return "Unknown"

    brand_str = str(brand).strip()
    brand_lower = brand_str.lower()

    # Whitespace-only input would otherwise come back as an empty string.
    if not brand_lower:
        return "Unknown"

    # Exact match against the mapping.
    if brand_lower in brand_mapping:
        return brand_mapping[brand_lower]

    # Keep the placeholder out of prefix matching, where it could be mapped
    # onto an unrelated brand that happens to start with "unk".
    if brand_lower == "unknown":
        return "Unknown"

    # Fall back to matching on the first three characters.
    if len(brand_lower) >= 3:
        prefix = brand_lower[:3]
        for key, value in brand_mapping.items():
            if key.startswith(prefix):
                return value

    # No mapping found: return the value in title case.
    return brand_str.title()
|
||||||
|
|
||||||
def calculate_age_from_dob(dob_value):
|
def calculate_age_from_dob(dob_value, transaction_date):
|
||||||
"""
|
"""
|
||||||
Convert DOB to age, handle 1900-01-01 as Unknown
|
Convert DOB to age based on transaction date, not today's date
|
||||||
|
Handles 1900-01-01 as Unknown
|
||||||
"""
|
"""
|
||||||
if pd.isna(dob_value) or dob_value == "":
|
if pd.isna(dob_value) or dob_value == "":
|
||||||
return "Unknown"
|
return "Unknown"
|
||||||
|
|
||||||
|
if pd.isna(transaction_date) or transaction_date == "":
|
||||||
|
return "Unknown"
|
||||||
|
|
||||||
dob_str = str(dob_value).strip()
|
dob_str = str(dob_value).strip()
|
||||||
|
|
||||||
# Check for the placeholder date
|
# Check for the placeholder date
|
||||||
@@ -119,16 +172,25 @@ def calculate_age_from_dob(dob_value):
|
|||||||
return "Unknown"
|
return "Unknown"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try to parse the date
|
# Parse DOB
|
||||||
if '-' in dob_str:
|
if '-' in dob_str:
|
||||||
dob = pd.to_datetime(dob_str.split()[0]) # Handle datetime strings
|
dob = pd.to_datetime(dob_str.split()[0])
|
||||||
elif '/' in dob_str:
|
elif '/' in dob_str:
|
||||||
dob = pd.to_datetime(dob_str)
|
dob = pd.to_datetime(dob_str)
|
||||||
else:
|
else:
|
||||||
return "Unknown"
|
return "Unknown"
|
||||||
|
|
||||||
today = datetime.now()
|
# Parse Transaction Date
|
||||||
age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
|
trans_date_str = str(transaction_date).strip()
|
||||||
|
if '-' in trans_date_str:
|
||||||
|
trans_date = pd.to_datetime(trans_date_str.split()[0])
|
||||||
|
elif '/' in trans_date_str:
|
||||||
|
trans_date = pd.to_datetime(trans_date_str)
|
||||||
|
else:
|
||||||
|
return "Unknown"
|
||||||
|
|
||||||
|
# Calculate age at time of transaction
|
||||||
|
age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day))
|
||||||
|
|
||||||
if age < 0 or age > 120: # Sanity check
|
if age < 0 or age > 120: # Sanity check
|
||||||
return "Unknown"
|
return "Unknown"
|
||||||
@@ -137,6 +199,48 @@ def calculate_age_from_dob(dob_value):
|
|||||||
except:
|
except:
|
||||||
return "Unknown"
|
return "Unknown"
|
||||||
|
|
||||||
|
def _parse_flexible_date(value):
    """Parse a '-' or '/' separated date value; return None when neither separator is present."""
    text = str(value).strip()
    if '-' in text:
        # Keep only the date part of "YYYY-MM-DD hh:mm:ss" style strings.
        return pd.to_datetime(text.split()[0])
    if '/' in text:
        return pd.to_datetime(text)
    return None


def calculate_registration_duration(registration_date, transaction_date):
    """
    Calculate the number of days between registration and transaction.

    Args:
        registration_date: registration date value (string or date-like).
        transaction_date: transaction date value (string or date-like).

    Returns:
        int: days from registration to transaction, between 0 and 3650.
            A transaction dated before registration yields 0.
        "3650+": when the difference exceeds 3650 days (about 10 years).
        "Unknown": when either value is missing, empty, has no '-' or '/'
            separator, or cannot be parsed.
    """
    if pd.isna(registration_date) or registration_date == "":
        return "Unknown"

    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"

    try:
        reg_date = _parse_flexible_date(registration_date)
        trans_date = _parse_flexible_date(transaction_date)
        if reg_date is None or trans_date is None:
            return "Unknown"
        days_diff = (trans_date - reg_date).days
    except Exception:
        # Best-effort parsing: any parse/arithmetic failure means "Unknown",
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return "Unknown"

    if days_diff < 0:
        # Transaction before registration - treat as 0. Returned as an int so
        # it is the same value as a genuine same-day difference.
        return 0

    if days_diff > 3650:  # Cap at 10 years (sanity check)
        return "3650+"

    return days_diff
|
||||||
|
|
||||||
def merge_contact_methods(row):
|
def merge_contact_methods(row):
|
||||||
"""
|
"""
|
||||||
Merge Email, SMS, Mail, Phone into one column with priority order
|
Merge Email, SMS, Mail, Phone into one column with priority order
|
||||||
@@ -157,7 +261,7 @@ def merge_contact_methods(row):
|
|||||||
|
|
||||||
return ','.join(contact_methods) # Return all methods as comma-separated
|
return ','.join(contact_methods) # Return all methods as comma-separated
|
||||||
|
|
||||||
def extract_date_components(date_value, column_name):
|
def extract_date_components(date_value, column_name, reference_date=None):
|
||||||
"""
|
"""
|
||||||
Extract Year, Month, TimeOfMonth, Day from date
|
Extract Year, Month, TimeOfMonth, Day from date
|
||||||
"""
|
"""
|
||||||
@@ -269,15 +373,21 @@ def transform_dataframe(df):
|
|||||||
if 'Gender' in df.columns:
|
if 'Gender' in df.columns:
|
||||||
print(" ✅ Keeping 'Gender'")
|
print(" ✅ Keeping 'Gender'")
|
||||||
|
|
||||||
# G > Convert DOB to Age
|
# G > Convert DOB to Age (using TransactionDate as reference)
|
||||||
if 'DOB' in df.columns:
|
if 'DOB' in df.columns and 'TransactionDate' in df.columns:
|
||||||
df['Age'] = df['DOB'].apply(calculate_age_from_dob)
|
df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1)
|
||||||
df = df.drop(columns=['DOB'])
|
df = df.drop(columns=['DOB'])
|
||||||
print(" ✅ Converted 'DOB' to 'Age' (1900-01-01 → Unknown)")
|
print(" ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)")
|
||||||
|
elif 'DOB' in df.columns:
|
||||||
|
print(" ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly")
|
||||||
|
|
||||||
# H > Keep RegistrationDate
|
# H > Convert RegistrationDate to duration (days since registration)
|
||||||
if 'RegistrationDate' in df.columns:
|
if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns:
|
||||||
print(" ✅ Keeping 'RegistrationDate'")
|
df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1)
|
||||||
|
df = df.drop(columns=['RegistrationDate'])
|
||||||
|
print(" ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)")
|
||||||
|
elif 'RegistrationDate' in df.columns:
|
||||||
|
print(" ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is")
|
||||||
|
|
||||||
# I > Drop FirstLoginDate
|
# I > Drop FirstLoginDate
|
||||||
if 'FirstLoginDate' in df.columns:
|
if 'FirstLoginDate' in df.columns:
|
||||||
@@ -370,19 +480,18 @@ def transform_dataframe(df):
|
|||||||
if 'Tier' in df.columns:
|
if 'Tier' in df.columns:
|
||||||
print(" ✅ Keeping 'Tier'")
|
print(" ✅ Keeping 'Tier'")
|
||||||
|
|
||||||
# AE, AF > Merge TransactionDate and CreateDate into date components
|
# AE > Convert TransactionDate into date components
|
||||||
date_columns_to_process = []
|
|
||||||
if 'TransactionDate' in df.columns:
|
if 'TransactionDate' in df.columns:
|
||||||
date_columns_to_process.append(('TransactionDate', 'Transaction'))
|
date_components = df['TransactionDate'].apply(lambda x: extract_date_components(x, 'Transaction'))
|
||||||
if 'CreateDate' in df.columns:
|
|
||||||
date_columns_to_process.append(('CreateDate', 'Create'))
|
|
||||||
|
|
||||||
for date_col, prefix in date_columns_to_process:
|
|
||||||
date_components = df[date_col].apply(lambda x: extract_date_components(x, prefix))
|
|
||||||
date_df = pd.DataFrame(date_components.tolist())
|
date_df = pd.DataFrame(date_components.tolist())
|
||||||
df = pd.concat([df, date_df], axis=1)
|
df = pd.concat([df, date_df], axis=1)
|
||||||
df = df.drop(columns=[date_col])
|
df = df.drop(columns=['TransactionDate'])
|
||||||
print(f" ✅ Converted '{date_col}' into 4 columns ({prefix}_Year, {prefix}_Month, {prefix}_TimeOfMonth, {prefix}_Day)")
|
print(" ✅ Converted 'TransactionDate' into 4 columns (Transaction_Year, Transaction_Month, Transaction_TimeOfMonth, Transaction_Day)")
|
||||||
|
|
||||||
|
# AF > Drop CreateDate (as requested - it's the same as TransactionDate)
|
||||||
|
if 'CreateDate' in df.columns:
|
||||||
|
df = df.drop(columns=['CreateDate'])
|
||||||
|
print(" 🗑️ Dropped 'CreateDate' (duplicate of TransactionDate)")
|
||||||
|
|
||||||
# AG > Drop MemberId
|
# AG > Drop MemberId
|
||||||
if 'MemberId' in df.columns:
|
if 'MemberId' in df.columns:
|
||||||
@@ -394,135 +503,128 @@ def transform_dataframe(df):
|
|||||||
df = df.drop(columns=['SiteId'])
|
df = df.drop(columns=['SiteId'])
|
||||||
print(" 🗑️ Dropped 'SiteId'")
|
print(" 🗑️ Dropped 'SiteId'")
|
||||||
|
|
||||||
# AI > Drop ParentSiteId
|
# AI > Clean and keep SiteName
|
||||||
if 'ParentSiteId' in df.columns:
|
|
||||||
df = df.drop(columns=['ParentSiteId'])
|
|
||||||
print(" 🗑️ Dropped 'ParentSiteId'")
|
|
||||||
|
|
||||||
# AJ > Keep and clean SiteName
|
|
||||||
if 'SiteName' in df.columns:
|
if 'SiteName' in df.columns:
|
||||||
df['SiteName'] = df['SiteName'].apply(clean_site_name)
|
df['SiteName'] = df['SiteName'].apply(clean_site_name)
|
||||||
print(" ✅ Kept and cleaned 'SiteName'")
|
print(" ✅ Kept and cleaned 'SiteName'")
|
||||||
|
|
||||||
# AK > Drop SiteType
|
# AJ > Keep Quantity
|
||||||
if 'SiteType' in df.columns:
|
|
||||||
df = df.drop(columns=['SiteType'])
|
|
||||||
print(" 🗑️ Dropped 'SiteType'")
|
|
||||||
|
|
||||||
# AL > Keep Quantity
|
|
||||||
if 'Quantity' in df.columns:
|
if 'Quantity' in df.columns:
|
||||||
print(" ✅ Keeping 'Quantity'")
|
print(" ✅ Keeping 'Quantity'")
|
||||||
|
|
||||||
# AM > Keep Amount
|
# AK > Keep Amount
|
||||||
if 'Amount' in df.columns:
|
if 'Amount' in df.columns:
|
||||||
print(" ✅ Keeping 'Amount'")
|
print(" ✅ Keeping 'Amount'")
|
||||||
|
|
||||||
# AN > Drop RewardType
|
# AL > Drop RewardType
|
||||||
if 'RewardType' in df.columns:
|
if 'RewardType' in df.columns:
|
||||||
df = df.drop(columns=['RewardType'])
|
df = df.drop(columns=['RewardType'])
|
||||||
print(" 🗑️ Dropped 'RewardType'")
|
print(" 🗑️ Dropped 'RewardType'")
|
||||||
|
|
||||||
# AO > Keep Points
|
# AM > Keep Points
|
||||||
if 'Points' in df.columns:
|
if 'Points' in df.columns:
|
||||||
print(" ✅ Keeping 'Points'")
|
print(" ✅ Keeping 'Points'")
|
||||||
|
|
||||||
# AP > Drop trxDetailId
|
# AN > Drop trxDetailId
|
||||||
if 'trxDetailId' in df.columns:
|
if 'trxDetailId' in df.columns:
|
||||||
df = df.drop(columns=['trxDetailId'])
|
df = df.drop(columns=['trxDetailId'])
|
||||||
print(" 🗑️ Dropped 'trxDetailId'")
|
print(" 🗑️ Dropped 'trxDetailId'")
|
||||||
|
|
||||||
# AQ > Drop TrxId
|
# AO > Drop TrxId
|
||||||
if 'TrxId' in df.columns:
|
if 'TrxId' in df.columns:
|
||||||
df = df.drop(columns=['TrxId'])
|
df = df.drop(columns=['TrxId'])
|
||||||
print(" 🗑️ Dropped 'TrxId'")
|
print(" 🗑️ Dropped 'TrxId'")
|
||||||
|
|
||||||
# AR > Drop TransactionStatusId
|
# AP > Drop TransactionStatusId
|
||||||
if 'TransactionStatusId' in df.columns:
|
if 'TransactionStatusId' in df.columns:
|
||||||
df = df.drop(columns=['TransactionStatusId'])
|
df = df.drop(columns=['TransactionStatusId'])
|
||||||
print(" 🗑️ Dropped 'TransactionStatusId'")
|
print(" 🗑️ Dropped 'TransactionStatusId'")
|
||||||
|
|
||||||
# AS > Keep TransactionStatusName
|
# AQ > Keep TransactionStatusName
|
||||||
if 'TransactionStatusName' in df.columns:
|
if 'TransactionStatusName' in df.columns:
|
||||||
print(" ✅ Keeping 'TransactionStatusName'")
|
print(" ✅ Keeping 'TransactionStatusName'")
|
||||||
|
|
||||||
# AT > Drop TransactionTypeId
|
# AR > Drop TransactionTypeId
|
||||||
if 'TransactionTypeId' in df.columns:
|
if 'TransactionTypeId' in df.columns:
|
||||||
df = df.drop(columns=['TransactionTypeId'])
|
df = df.drop(columns=['TransactionTypeId'])
|
||||||
print(" 🗑️ Dropped 'TransactionTypeId'")
|
print(" 🗑️ Dropped 'TransactionTypeId'")
|
||||||
|
|
||||||
# AU > Keep TransactionTypeName
|
# AS > Keep TransactionTypeName
|
||||||
if 'TransactionTypeName' in df.columns:
|
if 'TransactionTypeName' in df.columns:
|
||||||
print(" ✅ Keeping 'TransactionTypeName'")
|
print(" ✅ Keeping 'TransactionTypeName'")
|
||||||
|
|
||||||
# AV > Drop Reportable
|
# AT > Drop Reportable
|
||||||
if 'Reportable' in df.columns:
|
if 'Reportable' in df.columns:
|
||||||
df = df.drop(columns=['Reportable'])
|
df = df.drop(columns=['Reportable'])
|
||||||
print(" 🗑️ Dropped 'Reportable'")
|
print(" 🗑️ Dropped 'Reportable'")
|
||||||
|
|
||||||
# AW > Keep TransactionItemCode
|
# AU > Keep TransactionItemCode
|
||||||
if 'TransactionItemCode' in df.columns:
|
if 'TransactionItemCode' in df.columns:
|
||||||
print(" ✅ Keeping 'TransactionItemCode'")
|
print(" ✅ Keeping 'TransactionItemCode'")
|
||||||
|
|
||||||
# AX > Keep AnalysisCode1
|
# AV > Keep AnalysisCode1
|
||||||
if 'AnalysisCode1' in df.columns:
|
if 'AnalysisCode1' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode1'")
|
print(" ✅ Keeping 'AnalysisCode1'")
|
||||||
|
|
||||||
# AY > Keep AnalysisCode2
|
# AW > Keep AnalysisCode2
|
||||||
if 'AnalysisCode2' in df.columns:
|
if 'AnalysisCode2' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode2'")
|
print(" ✅ Keeping 'AnalysisCode2'")
|
||||||
|
|
||||||
# AZ > Keep AnalysisCode3
|
# AX > Keep AnalysisCode3
|
||||||
if 'AnalysisCode3' in df.columns:
|
if 'AnalysisCode3' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode3'")
|
print(" ✅ Keeping 'AnalysisCode3'")
|
||||||
|
|
||||||
# BA > Keep AnalysisCode4
|
# AY > Keep AnalysisCode4
|
||||||
if 'AnalysisCode4' in df.columns:
|
if 'AnalysisCode4' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode4'")
|
print(" ✅ Keeping 'AnalysisCode4'")
|
||||||
|
|
||||||
# BB > Keep and clean Brand
|
# AZ > Dynamically clean Brand
|
||||||
if 'Brand' in df.columns:
|
if 'Brand' in df.columns:
|
||||||
df['Brand'] = df['Brand'].apply(clean_brand)
|
print(" 🔍 Analyzing unique brand names to create dynamic mapping...")
|
||||||
print(" ✅ Kept and cleaned 'Brand'")
|
brand_mapping = create_brand_mapping(df['Brand'])
|
||||||
|
print(f" 📊 Created mapping for {len(brand_mapping)} unique brand variations")
|
||||||
|
df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping))
|
||||||
|
print(" ✅ Kept and dynamically cleaned 'Brand'")
|
||||||
|
|
||||||
# BC > Keep AnalysisCode6
|
# BA > Keep AnalysisCode6
|
||||||
if 'AnalysisCode6' in df.columns:
|
if 'AnalysisCode6' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode6'")
|
print(" ✅ Keeping 'AnalysisCode6'")
|
||||||
|
|
||||||
# BD > Keep AnalysisCode7
|
# BB > Keep AnalysisCode7
|
||||||
if 'AnalysisCode7' in df.columns:
|
if 'AnalysisCode7' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode7'")
|
print(" ✅ Keeping 'AnalysisCode7'")
|
||||||
|
|
||||||
# BE > Keep AnalysisCode8
|
# BC > Keep AnalysisCode8
|
||||||
if 'AnalysisCode8' in df.columns:
|
if 'AnalysisCode8' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode8'")
|
print(" ✅ Keeping 'AnalysisCode8'")
|
||||||
|
|
||||||
# BF > Keep Price
|
# BD > Keep Price
|
||||||
if 'Price' in df.columns:
|
if 'Price' in df.columns:
|
||||||
print(" ✅ Keeping 'Price'")
|
print(" ✅ Keeping 'Price'")
|
||||||
|
|
||||||
# BG > Keep AnalysisCode10
|
# BE > Keep AnalysisCode10
|
||||||
if 'AnalysisCode10' in df.columns:
|
if 'AnalysisCode10' in df.columns:
|
||||||
print(" ✅ Keeping 'AnalysisCode10'")
|
print(" ✅ Keeping 'AnalysisCode10'")
|
||||||
|
|
||||||
# BH > Keep InvalidReason
|
# BF > Keep InvalidReason
|
||||||
if 'InvalidReason' in df.columns:
|
if 'InvalidReason' in df.columns:
|
||||||
print(" ✅ Keeping 'InvalidReason'")
|
print(" ✅ Keeping 'InvalidReason'")
|
||||||
|
|
||||||
# BI > Drop Description
|
# BG > Drop Description
|
||||||
if 'Description' in df.columns:
|
if 'Description' in df.columns:
|
||||||
df = df.drop(columns=['Description'])
|
df = df.drop(columns=['Description'])
|
||||||
print(" 🗑️ Dropped 'Description'")
|
print(" 🗑️ Dropped 'Description'")
|
||||||
|
|
||||||
# BJ > Drop PromotionId
|
# BH > Drop PromotionId
|
||||||
if 'PromotionId' in df.columns:
|
if 'PromotionId' in df.columns:
|
||||||
df = df.drop(columns=['PromotionId'])
|
df = df.drop(columns=['PromotionId'])
|
||||||
print(" 🗑️ Dropped 'PromotionId'")
|
print(" 🗑️ Dropped 'PromotionId'")
|
||||||
|
|
||||||
# BK > Keep PromotionName
|
# BI > Keep PromotionName
|
||||||
if 'PromotionName' in df.columns:
|
if 'PromotionName' in df.columns:
|
||||||
print(" ✅ Keeping 'PromotionName'")
|
print(" ✅ Keeping 'PromotionName'")
|
||||||
|
|
||||||
# BL > Convert PromotionStartDate into 4 columns
|
# BJ > Convert PromotionStartDate into 4 columns
|
||||||
if 'PromotionStartDate' in df.columns:
|
if 'PromotionStartDate' in df.columns:
|
||||||
date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart'))
|
date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart'))
|
||||||
date_df = pd.DataFrame(date_components.tolist())
|
date_df = pd.DataFrame(date_components.tolist())
|
||||||
@@ -530,32 +632,32 @@ def transform_dataframe(df):
|
|||||||
df = df.drop(columns=['PromotionStartDate'])
|
df = df.drop(columns=['PromotionStartDate'])
|
||||||
print(" ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)")
|
print(" ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)")
|
||||||
|
|
||||||
# BM > Drop PromotionEndDate
|
# BK > Drop PromotionEndDate
|
||||||
if 'PromotionEndDate' in df.columns:
|
if 'PromotionEndDate' in df.columns:
|
||||||
df = df.drop(columns=['PromotionEndDate'])
|
df = df.drop(columns=['PromotionEndDate'])
|
||||||
print(" 🗑️ Dropped 'PromotionEndDate'")
|
print(" 🗑️ Dropped 'PromotionEndDate'")
|
||||||
|
|
||||||
# BN > Drop PromotionOfferTypeId
|
# BL > Drop PromotionOfferTypeId
|
||||||
if 'PromotionOfferTypeId' in df.columns:
|
if 'PromotionOfferTypeId' in df.columns:
|
||||||
df = df.drop(columns=['PromotionOfferTypeId'])
|
df = df.drop(columns=['PromotionOfferTypeId'])
|
||||||
print(" 🗑️ Dropped 'PromotionOfferTypeId'")
|
print(" 🗑️ Dropped 'PromotionOfferTypeId'")
|
||||||
|
|
||||||
# BO > Drop PromotionOfferTypeName
|
# BM > Drop PromotionOfferTypeName
|
||||||
if 'PromotionOfferTypeName' in df.columns:
|
if 'PromotionOfferTypeName' in df.columns:
|
||||||
df = df.drop(columns=['PromotionOfferTypeName'])
|
df = df.drop(columns=['PromotionOfferTypeName'])
|
||||||
print(" 🗑️ Dropped 'PromotionOfferTypeName'")
|
print(" 🗑️ Dropped 'PromotionOfferTypeName'")
|
||||||
|
|
||||||
# BP > Drop PromotionSiteId
|
# BN > Drop PromotionSiteId
|
||||||
if 'PromotionSiteId' in df.columns:
|
if 'PromotionSiteId' in df.columns:
|
||||||
df = df.drop(columns=['PromotionSiteId'])
|
df = df.drop(columns=['PromotionSiteId'])
|
||||||
print(" 🗑️ Dropped 'PromotionSiteId'")
|
print(" 🗑️ Dropped 'PromotionSiteId'")
|
||||||
|
|
||||||
# BQ > Drop PromotionSite
|
# BO > Drop PromotionSite
|
||||||
if 'PromotionSite' in df.columns:
|
if 'PromotionSite' in df.columns:
|
||||||
df = df.drop(columns=['PromotionSite'])
|
df = df.drop(columns=['PromotionSite'])
|
||||||
print(" 🗑️ Dropped 'PromotionSite'")
|
print(" 🗑️ Dropped 'PromotionSite'")
|
||||||
|
|
||||||
# BR > Drop QualifyingProductQuantity
|
# BP > Drop QualifyingProductQuantity
|
||||||
if 'QualifyingProductQuantity' in df.columns:
|
if 'QualifyingProductQuantity' in df.columns:
|
||||||
df = df.drop(columns=['QualifyingProductQuantity'])
|
df = df.drop(columns=['QualifyingProductQuantity'])
|
||||||
print(" 🗑️ Dropped 'QualifyingProductQuantity'")
|
print(" 🗑️ Dropped 'QualifyingProductQuantity'")
|
||||||
@@ -619,64 +721,6 @@ def read_and_process_file(file_path, max_rows=5000):
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return None, None, None
|
return None, None, None
|
||||||
|
|
||||||
def encode_file_to_base64(file_content):
    """
    Return the base64 text (ASCII str) for the given file content.

    If the direct encoding raises, the error is printed and the content is
    re-encoded using only the items whose value is below 128.
    """
    try:
        return base64.b64encode(file_content).decode('ascii')
    except Exception as e:
        print(f"❌ Error encoding to base64: {e}")
        # Fallback: keep only 7-bit values and encode those.
        ascii_only = bytes(item for item in file_content if item < 128)
        return base64.b64encode(ascii_only).decode('ascii')
|
|
||||||
|
|
||||||
def send_to_api(file_name, base64_data):
    """
    POST the base64-encoded file to the dataset-processing API.

    Returns the response object from requests.post (whatever its status
    code), or None when an exception is raised while sending.
    """
    endpoint = "https://problab-api-0004c00ee319.hosted.ghaymah.systems/process_dataset"

    # Innermost record of the event envelope the request body is built from.
    record = {
        "id": "snipp_transformed",
        "file_data": base64_data,
        "file_name": file_name,
        "hasHeader": True,
        "delimiter": ","
    }
    body = {"event": {"data": {"new": record}}}

    request_headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Data-Transformer/1.0',
        'Accept': 'application/json'
    }

    try:
        print(f"\n🔄 Sending transformed file '{file_name}' to API...")
        print(f"📊 Base64 data size: {len(base64_data)} characters")

        response = requests.post(endpoint, json=body, headers=request_headers, timeout=60)

        if response.status_code != 200:
            print(f"❌ Failed to send file. Status code: {response.status_code}")
            print(f"📋 Response: {response.text[:500]}")
        else:
            print("✅ File sent successfully!")
            print(f"📋 Response status: {response.status_code}")

        return response

    except Exception as e:
        print(f"❌ Error occurred while sending to API: {e}")
        return None
|
|
||||||
|
|
||||||
def save_clean_dataset(df, file_name):
|
def save_clean_dataset(df, file_name):
|
||||||
"""
|
"""
|
||||||
Save the transformed dataset locally
|
Save the transformed dataset locally
|
||||||
@@ -693,10 +737,10 @@ def save_clean_dataset(df, file_name):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""
|
"""
|
||||||
Main function to execute all transformations and upload
|
Main function to execute all transformations
|
||||||
"""
|
"""
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
print("🚢 Ship Performance Dataset - Complete Transformation & Upload")
|
print("🚢 Ship Performance Dataset - Complete Transformation")
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
# Specify the path to your Excel file
|
# Specify the path to your Excel file
|
||||||
@@ -710,21 +754,12 @@ def main():
|
|||||||
print("\n❌ Process failed. Please check if the file exists.")
|
print("\n❌ Process failed. Please check if the file exists.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Encode to base64
|
|
||||||
print("\n2️⃣ Encoding transformed file to base64...")
|
|
||||||
base64_data = encode_file_to_base64(file_content)
|
|
||||||
print(f" ✅ Encoding complete ({len(base64_data)} characters)")
|
|
||||||
|
|
||||||
# Send to API
|
|
||||||
print("\n3️⃣ Sending transformed data to API...")
|
|
||||||
response = send_to_api(modified_file_name, base64_data)
|
|
||||||
|
|
||||||
# Save locally
|
# Save locally
|
||||||
save_clean_dataset(df, modified_file_name)
|
save_clean_dataset(df, modified_file_name)
|
||||||
|
|
||||||
# Save transformation summary
|
# Save transformation summary
|
||||||
summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
|
summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
|
||||||
with open(summary_file, 'w') as f:
|
with open(summary_file, 'w', encoding='utf-8') as f:
|
||||||
f.write("TRANSFORMATION SUMMARY\n")
|
f.write("TRANSFORMATION SUMMARY\n")
|
||||||
f.write("=" * 50 + "\n\n")
|
f.write("=" * 50 + "\n\n")
|
||||||
f.write(f"Original file: {excel_file_path}\n")
|
f.write(f"Original file: {excel_file_path}\n")
|
||||||
@@ -733,23 +768,34 @@ def main():
|
|||||||
f.write("Final columns list:\n")
|
f.write("Final columns list:\n")
|
||||||
for col in df.columns:
|
for col in df.columns:
|
||||||
f.write(f" - {col}\n")
|
f.write(f" - {col}\n")
|
||||||
|
f.write("\n" + "=" * 50 + "\n\n")
|
||||||
|
f.write("Key transformations applied:\n")
|
||||||
|
f.write(" - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n")
|
||||||
|
f.write(" - Converted DOB to Age (using TransactionDate as reference, not today's date)\n")
|
||||||
|
f.write(" - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n")
|
||||||
|
f.write(" - Dropped CreateDate (duplicate of TransactionDate)\n")
|
||||||
|
f.write(" - Dynamically cleaned Brand names using prefix matching\n")
|
||||||
|
f.write(" - Cleaned SiteName variations\n")
|
||||||
|
f.write(" - Merged contact method columns into single ContactMethod field\n")
|
||||||
|
f.write(" - Split date columns into Year, Month, TimeOfMonth, Day components\n")
|
||||||
|
f.write(" - Removed redundant columns (StoreId, Store, SiteType, etc.)\n")
|
||||||
|
|
||||||
print(f"\n📄 Transformation summary saved: {summary_file}")
|
print(f"\n📄 Transformation summary saved: {summary_file}")
|
||||||
|
|
||||||
print("\n" + "=" * 80)
|
print("\n" + "=" * 80)
|
||||||
if response and response.status_code == 200:
|
print("🎉 All transformations completed successfully! إن شاء الله")
|
||||||
print("🎉 All transformations completed and file uploaded successfully! إن شاء الله")
|
print(f" ✅ {len(df)} rows processed")
|
||||||
print(f" ✅ {len(df)} rows processed")
|
print(f" ✅ {len(df.columns)} columns in final dataset")
|
||||||
print(f" ✅ {len(df.columns)} columns in final dataset")
|
print(" ✅ Recurring customer flag added")
|
||||||
print(" ✅ Recurring customer flag added")
|
print(" ✅ DOB converted to Age (using transaction date)")
|
||||||
print(" ✅ DOB converted to Age")
|
print(" ✅ RegistrationDate converted to DaysSinceRegistration")
|
||||||
print(" ✅ Contact methods merged")
|
print(" ✅ CreateDate dropped (duplicate)")
|
||||||
print(" ✅ Date columns split into components")
|
print(" ✅ Contact methods merged")
|
||||||
print(" ✅ SiteName and Brand cleaned")
|
print(" ✅ Date columns split into components")
|
||||||
else:
|
print(" ✅ SiteName and Brand dynamically cleaned")
|
||||||
print("⚠️ Process completed but API upload may have failed.")
|
|
||||||
print(" 💡 Transformed file saved locally for inspection.")
|
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|
||||||
المرجع في مشكلة جديدة
حظر مستخدم