Update transform.py

هذا الالتزام موجود في:
2026-04-28 17:54:16 +00:00
الأصل b42a70a3a4
التزام 16b39ae5ab

عرض الملف

@@ -4,15 +4,16 @@
# بسم الله الرحمن الرحيم
# بسم الله الرحمن الرحيم
import base64
import json
import requests
import os
import pandas as pd
import os
import numpy as np
from datetime import datetime
import io
import numpy as np
import re
from difflib import SequenceMatcher
def sanitize_text(value):
"""
@@ -71,47 +72,99 @@ def clean_site_name(name):
return name.title()
def clean_brand(brand):
def create_brand_mapping(series):
    """
    Build a lowercase-brand -> canonical-name mapping from observed values.

    Every non-null value is converted to ``str`` and stripped. Blank values
    and "unknown" (any case) are ignored. The remaining names are grouped by
    lowercase prefix: the first 5 characters, or the whole name when it is
    only 3 or 4 characters long. Within each group the most frequent spelling
    is chosen and all members are mapped to its title-cased form.

    Args:
        series: pandas Series of raw brand values.

    Returns:
        dict mapping each lowercase brand spelling to its canonical,
        title-cased name. Names shorter than 3 characters get no entry.
    """
    from collections import Counter

    # Normalise once so that grouping and frequency counting use the same
    # keys. Counting the raw values instead would make any spelling that
    # carries surrounding whitespace look like it never occurs.
    cleaned = [str(value).strip() for value in series.dropna()]
    occurrences = Counter(cleaned)

    # Counter keeps first-seen order, so this is the de-duplicated brand list.
    unique_brands = [
        brand for brand in occurrences
        if brand != "" and brand.lower() != "unknown"
    ]

    # Group names by their lowercase prefix (at most 5 characters).
    brand_groups = {}
    for brand in unique_brands:
        brand_lower = brand.lower()
        if len(brand_lower) < 3:
            # Too short to group reliably; left unmapped on purpose.
            continue
        brand_groups.setdefault(brand_lower[:5], []).append(brand)

    # Map every member of a group to the group's most frequent spelling.
    # On ties, max() keeps the spelling that appeared first in the data.
    brand_map = {}
    for brands in brand_groups.values():
        best_match = max(brands, key=lambda b: occurrences[b])
        canonical_name = best_match.title()
        for brand in brands:
            brand_map[brand.lower()] = canonical_name

    # NOTE: a substring-based pass is intentionally absent. It would only
    # consider names longer than 3 characters that are not yet mapped, and
    # the prefix grouping above already maps every such name.
    return brand_map
def clean_brand_dynamic(brand, brand_mapping):
    """
    Normalise a single brand value using a precomputed mapping.

    Args:
        brand: raw brand value; may be null, blank, or any type that
            converts to ``str``.
        brand_mapping: dict of lowercase brand spelling -> canonical name.
            The mapping passed in is the one used; nothing is hard-coded.

    Returns:
        "Unknown" for null, empty or whitespace-only input. Otherwise the
        mapped canonical name, falling back to the first mapping entry that
        shares the same first three characters, and finally to the
        title-cased input.
    """
    if pd.isna(brand) or brand == "":
        return "Unknown"

    brand_str = str(brand).strip()
    if not brand_str:
        # Whitespace-only values carry no brand information.
        return "Unknown"
    brand_lower = brand_str.lower()

    # Exact (case-insensitive) match first.
    if brand_lower in brand_mapping:
        return brand_mapping[brand_lower]

    # Otherwise accept the first known brand sharing a 3-character prefix.
    if len(brand_lower) >= 3:
        prefix = brand_lower[:3]
        for key, value in brand_mapping.items():
            if len(key) >= 3 and key[:3] == prefix:
                return value

    # Nothing matched: keep the value, title-cased.
    return brand_str.title()
def calculate_age_from_dob(dob_value):
def calculate_age_from_dob(dob_value, transaction_date):
"""
Convert DOB to age, handle 1900-01-01 as Unknown
Convert DOB to age based on transaction date, not today's date
Handles 1900-01-01 as Unknown
"""
if pd.isna(dob_value) or dob_value == "":
return "Unknown"
if pd.isna(transaction_date) or transaction_date == "":
return "Unknown"
dob_str = str(dob_value).strip()
# Check for the placeholder date
@@ -119,16 +172,25 @@ def calculate_age_from_dob(dob_value):
return "Unknown"
try:
# Try to parse the date
# Parse DOB
if '-' in dob_str:
dob = pd.to_datetime(dob_str.split()[0]) # Handle datetime strings
dob = pd.to_datetime(dob_str.split()[0])
elif '/' in dob_str:
dob = pd.to_datetime(dob_str)
else:
return "Unknown"
today = datetime.now()
age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
# Parse Transaction Date
trans_date_str = str(transaction_date).strip()
if '-' in trans_date_str:
trans_date = pd.to_datetime(trans_date_str.split()[0])
elif '/' in trans_date_str:
trans_date = pd.to_datetime(trans_date_str)
else:
return "Unknown"
# Calculate age at time of transaction
age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day))
if age < 0 or age > 120: # Sanity check
return "Unknown"
@@ -137,6 +199,48 @@ def calculate_age_from_dob(dob_value):
except:
return "Unknown"
def _parse_date_token(value):
    """Parse a date-like value, or return None when it has no '-' or '/'."""
    text = str(value).strip()
    if '-' in text:
        # Keep only the part before the first whitespace, which drops any
        # time-of-day suffix such as "2020-01-01 10:30:00".
        return pd.to_datetime(text.split()[0])
    if '/' in text:
        return pd.to_datetime(text)
    return None


def calculate_registration_duration(registration_date, transaction_date):
    """
    Calculate the number of days between registration and transaction.

    Args:
        registration_date: registration date (string or timestamp-like).
        transaction_date: transaction date (string or timestamp-like).

    Returns:
        - "Unknown" when either value is null, empty, has no '-' or '/'
          separator, or cannot be parsed;
        - 0 when the transaction precedes the registration;
        - "3650+" when the gap exceeds 3650 days (about 10 years);
        - otherwise the whole number of days as an int.
    """
    if pd.isna(registration_date) or registration_date == "":
        return "Unknown"
    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"

    try:
        reg_date = _parse_date_token(registration_date)
        trans_date = _parse_date_token(transaction_date)
        if reg_date is None or trans_date is None:
            return "Unknown"
        days_diff = (trans_date - reg_date).days
    except Exception:
        # Best-effort per-value parsing: any parse or arithmetic failure
        # means the value is unusable. Unlike a bare ``except:``, this does
        # not swallow KeyboardInterrupt or SystemExit.
        return "Unknown"

    if days_diff < 0:
        # Transaction before registration - treat as 0. Returned as an int
        # so it has the same type as every other numeric duration.
        return 0
    if days_diff > 3650:  # Cap at 10 years (sanity check)
        return "3650+"
    return days_diff
def merge_contact_methods(row):
"""
Merge Email, SMS, Mail, Phone into one column with priority order
@@ -157,7 +261,7 @@ def merge_contact_methods(row):
return ','.join(contact_methods) # Return all methods as comma-separated
def extract_date_components(date_value, column_name):
def extract_date_components(date_value, column_name, reference_date=None):
"""
Extract Year, Month, TimeOfMonth, Day from date
"""
@@ -269,15 +373,21 @@ def transform_dataframe(df):
if 'Gender' in df.columns:
print(" ✅ Keeping 'Gender'")
# G > Convert DOB to Age
if 'DOB' in df.columns:
df['Age'] = df['DOB'].apply(calculate_age_from_dob)
# G > Convert DOB to Age (using TransactionDate as reference)
if 'DOB' in df.columns and 'TransactionDate' in df.columns:
df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1)
df = df.drop(columns=['DOB'])
print(" ✅ Converted 'DOB' to 'Age' (1900-01-01 → Unknown)")
print(" ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)")
elif 'DOB' in df.columns:
print(" ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly")
# H > Keep RegistrationDate
if 'RegistrationDate' in df.columns:
print(" ✅ Keeping 'RegistrationDate'")
# H > Convert RegistrationDate to duration (days since registration)
if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns:
df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1)
df = df.drop(columns=['RegistrationDate'])
print(" ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)")
elif 'RegistrationDate' in df.columns:
print(" ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is")
# I > Drop FirstLoginDate
if 'FirstLoginDate' in df.columns:
@@ -370,19 +480,18 @@ def transform_dataframe(df):
if 'Tier' in df.columns:
print(" ✅ Keeping 'Tier'")
# AE, AF > Merge TransactionDate and CreateDate into date components
date_columns_to_process = []
# AE > Convert TransactionDate into date components
if 'TransactionDate' in df.columns:
date_columns_to_process.append(('TransactionDate', 'Transaction'))
if 'CreateDate' in df.columns:
date_columns_to_process.append(('CreateDate', 'Create'))
for date_col, prefix in date_columns_to_process:
date_components = df[date_col].apply(lambda x: extract_date_components(x, prefix))
date_components = df['TransactionDate'].apply(lambda x: extract_date_components(x, 'Transaction'))
date_df = pd.DataFrame(date_components.tolist())
df = pd.concat([df, date_df], axis=1)
df = df.drop(columns=[date_col])
print(f" ✅ Converted '{date_col}' into 4 columns ({prefix}_Year, {prefix}_Month, {prefix}_TimeOfMonth, {prefix}_Day)")
df = df.drop(columns=['TransactionDate'])
print(" ✅ Converted 'TransactionDate' into 4 columns (Transaction_Year, Transaction_Month, Transaction_TimeOfMonth, Transaction_Day)")
# AF > Drop CreateDate (as requested - it's the same as TransactionDate)
if 'CreateDate' in df.columns:
df = df.drop(columns=['CreateDate'])
print(" 🗑️ Dropped 'CreateDate' (duplicate of TransactionDate)")
# AG > Drop MemberId
if 'MemberId' in df.columns:
@@ -394,135 +503,128 @@ def transform_dataframe(df):
df = df.drop(columns=['SiteId'])
print(" 🗑️ Dropped 'SiteId'")
# AI > Drop ParentSiteId
if 'ParentSiteId' in df.columns:
df = df.drop(columns=['ParentSiteId'])
print(" 🗑️ Dropped 'ParentSiteId'")
# AJ > Keep and clean SiteName
# AI > Clean and keep SiteName
if 'SiteName' in df.columns:
df['SiteName'] = df['SiteName'].apply(clean_site_name)
print(" ✅ Kept and cleaned 'SiteName'")
# AK > Drop SiteType
if 'SiteType' in df.columns:
df = df.drop(columns=['SiteType'])
print(" 🗑️ Dropped 'SiteType'")
# AL > Keep Quantity
# AJ > Keep Quantity
if 'Quantity' in df.columns:
print(" ✅ Keeping 'Quantity'")
# AM > Keep Amount
# AK > Keep Amount
if 'Amount' in df.columns:
print(" ✅ Keeping 'Amount'")
# AN > Drop RewardType
# AL > Drop RewardType
if 'RewardType' in df.columns:
df = df.drop(columns=['RewardType'])
print(" 🗑️ Dropped 'RewardType'")
# AO > Keep Points
# AM > Keep Points
if 'Points' in df.columns:
print(" ✅ Keeping 'Points'")
# AP > Drop trxDetailId
# AN > Drop trxDetailId
if 'trxDetailId' in df.columns:
df = df.drop(columns=['trxDetailId'])
print(" 🗑️ Dropped 'trxDetailId'")
# AQ > Drop TrxId
# AO > Drop TrxId
if 'TrxId' in df.columns:
df = df.drop(columns=['TrxId'])
print(" 🗑️ Dropped 'TrxId'")
# AR > Drop TransactionStatusId
# AP > Drop TransactionStatusId
if 'TransactionStatusId' in df.columns:
df = df.drop(columns=['TransactionStatusId'])
print(" 🗑️ Dropped 'TransactionStatusId'")
# AS > Keep TransactionStatusName
# AQ > Keep TransactionStatusName
if 'TransactionStatusName' in df.columns:
print(" ✅ Keeping 'TransactionStatusName'")
# AT > Drop TransactionTypeId
# AR > Drop TransactionTypeId
if 'TransactionTypeId' in df.columns:
df = df.drop(columns=['TransactionTypeId'])
print(" 🗑️ Dropped 'TransactionTypeId'")
# AU > Keep TransactionTypeName
# AS > Keep TransactionTypeName
if 'TransactionTypeName' in df.columns:
print(" ✅ Keeping 'TransactionTypeName'")
# AV > Drop Reportable
# AT > Drop Reportable
if 'Reportable' in df.columns:
df = df.drop(columns=['Reportable'])
print(" 🗑️ Dropped 'Reportable'")
# AW > Keep TransactionItemCode
# AU > Keep TransactionItemCode
if 'TransactionItemCode' in df.columns:
print(" ✅ Keeping 'TransactionItemCode'")
# AX > Keep AnalysisCode1
# AV > Keep AnalysisCode1
if 'AnalysisCode1' in df.columns:
print(" ✅ Keeping 'AnalysisCode1'")
# AY > Keep AnalysisCode2
# AW > Keep AnalysisCode2
if 'AnalysisCode2' in df.columns:
print(" ✅ Keeping 'AnalysisCode2'")
# AZ > Keep AnalysisCode3
# AX > Keep AnalysisCode3
if 'AnalysisCode3' in df.columns:
print(" ✅ Keeping 'AnalysisCode3'")
# BA > Keep AnalysisCode4
# AY > Keep AnalysisCode4
if 'AnalysisCode4' in df.columns:
print(" ✅ Keeping 'AnalysisCode4'")
# BB > Keep and clean Brand
# AZ > Dynamically clean Brand
if 'Brand' in df.columns:
df['Brand'] = df['Brand'].apply(clean_brand)
print(" ✅ Kept and cleaned 'Brand'")
print(" 🔍 Analyzing unique brand names to create dynamic mapping...")
brand_mapping = create_brand_mapping(df['Brand'])
print(f" 📊 Created mapping for {len(brand_mapping)} unique brand variations")
df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping))
print(" ✅ Kept and dynamically cleaned 'Brand'")
# BC > Keep AnalysisCode6
# BA > Keep AnalysisCode6
if 'AnalysisCode6' in df.columns:
print(" ✅ Keeping 'AnalysisCode6'")
# BD > Keep AnalysisCode7
# BB > Keep AnalysisCode7
if 'AnalysisCode7' in df.columns:
print(" ✅ Keeping 'AnalysisCode7'")
# BE > Keep AnalysisCode8
# BC > Keep AnalysisCode8
if 'AnalysisCode8' in df.columns:
print(" ✅ Keeping 'AnalysisCode8'")
# BF > Keep Price
# BD > Keep Price
if 'Price' in df.columns:
print(" ✅ Keeping 'Price'")
# BG > Keep AnalysisCode10
# BE > Keep AnalysisCode10
if 'AnalysisCode10' in df.columns:
print(" ✅ Keeping 'AnalysisCode10'")
# BH > Keep InvalidReason
# BF > Keep InvalidReason
if 'InvalidReason' in df.columns:
print(" ✅ Keeping 'InvalidReason'")
# BI > Drop Description
# BG > Drop Description
if 'Description' in df.columns:
df = df.drop(columns=['Description'])
print(" 🗑️ Dropped 'Description'")
# BJ > Drop PromotionId
# BH > Drop PromotionId
if 'PromotionId' in df.columns:
df = df.drop(columns=['PromotionId'])
print(" 🗑️ Dropped 'PromotionId'")
# BK > Keep PromotionName
# BI > Keep PromotionName
if 'PromotionName' in df.columns:
print(" ✅ Keeping 'PromotionName'")
# BL > Convert PromotionStartDate into 4 columns
# BJ > Convert PromotionStartDate into 4 columns
if 'PromotionStartDate' in df.columns:
date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart'))
date_df = pd.DataFrame(date_components.tolist())
@@ -530,32 +632,32 @@ def transform_dataframe(df):
df = df.drop(columns=['PromotionStartDate'])
print(" ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)")
# BM > Drop PromotionEndDate
# BK > Drop PromotionEndDate
if 'PromotionEndDate' in df.columns:
df = df.drop(columns=['PromotionEndDate'])
print(" 🗑️ Dropped 'PromotionEndDate'")
# BN > Drop PromotionOfferTypeId
# BL > Drop PromotionOfferTypeId
if 'PromotionOfferTypeId' in df.columns:
df = df.drop(columns=['PromotionOfferTypeId'])
print(" 🗑️ Dropped 'PromotionOfferTypeId'")
# BO > Drop PromotionOfferTypeName
# BM > Drop PromotionOfferTypeName
if 'PromotionOfferTypeName' in df.columns:
df = df.drop(columns=['PromotionOfferTypeName'])
print(" 🗑️ Dropped 'PromotionOfferTypeName'")
# BP > Drop PromotionSiteId
# BN > Drop PromotionSiteId
if 'PromotionSiteId' in df.columns:
df = df.drop(columns=['PromotionSiteId'])
print(" 🗑️ Dropped 'PromotionSiteId'")
# BQ > Drop PromotionSite
# BO > Drop PromotionSite
if 'PromotionSite' in df.columns:
df = df.drop(columns=['PromotionSite'])
print(" 🗑️ Dropped 'PromotionSite'")
# BR > Drop QualifyingProductQuantity
# BP > Drop QualifyingProductQuantity
if 'QualifyingProductQuantity' in df.columns:
df = df.drop(columns=['QualifyingProductQuantity'])
print(" 🗑️ Dropped 'QualifyingProductQuantity'")
@@ -619,64 +721,6 @@ def read_and_process_file(file_path, max_rows=5000):
traceback.print_exc()
return None, None, None
def encode_file_to_base64(file_content):
    """
    Encode file content to a base64 ASCII string.

    Args:
        file_content: bytes-like object to encode. A ``str`` is accepted
            too and is encoded as UTF-8 first.

    Returns:
        The base64 representation as a ``str``.

    Raises:
        TypeError: if ``file_content`` is neither ``str`` nor bytes-like.
    """
    if isinstance(file_content, str):
        file_content = file_content.encode('utf-8')
    # Base64 handles every byte value, so no "cleaning" step is needed.
    # Dropping bytes >= 128 would silently corrupt the payload.
    return base64.b64encode(file_content).decode('ascii')
def send_to_api(file_name, base64_data):
    """
    POST the base64-encoded file to the dataset-processing endpoint.

    Args:
        file_name: name reported to the API for the uploaded file.
        base64_data: base64 string holding the file content.

    Returns:
        The ``requests`` response object (whatever its status code), or
        None when an exception occurred while sending.
    """
    api_url = "https://problab-api-0004c00ee319.hosted.ghaymah.systems/process_dataset"

    # The request body nests the record under event -> data -> new.
    record = {
        "id": "snipp_transformed",
        "file_data": base64_data,
        "file_name": file_name,
        "hasHeader": True,
        "delimiter": ","
    }
    payload = {"event": {"data": {"new": record}}}

    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Data-Transformer/1.0',
        'Accept': 'application/json'
    }

    try:
        print(f"\n🔄 Sending transformed file '{file_name}' to API...")
        print(f"📊 Base64 data size: {len(base64_data)} characters")

        response = requests.post(api_url, json=payload, headers=headers, timeout=60)

        if response.status_code != 200:
            print(f"❌ Failed to send file. Status code: {response.status_code}")
            # Only the first 500 characters of the response body are shown.
            print(f"📋 Response: {response.text[:500]}")
            return response

        print("✅ File sent successfully!")
        print(f"📋 Response status: {response.status_code}")
        return response
    except Exception as e:
        print(f"❌ Error occurred while sending to API: {e}")
        return None
def save_clean_dataset(df, file_name):
"""
Save the transformed dataset locally
@@ -693,10 +737,10 @@ def save_clean_dataset(df, file_name):
def main():
"""
Main function to execute all transformations and upload
Main function to execute all transformations
"""
print("=" * 80)
print("🚢 Ship Performance Dataset - Complete Transformation & Upload")
print("🚢 Ship Performance Dataset - Complete Transformation")
print("=" * 80)
# Specify the path to your Excel file
@@ -710,21 +754,12 @@ def main():
print("\n❌ Process failed. Please check if the file exists.")
return
# Encode to base64
print("\n2⃣ Encoding transformed file to base64...")
base64_data = encode_file_to_base64(file_content)
print(f" ✅ Encoding complete ({len(base64_data)} characters)")
# Send to API
print("\n3⃣ Sending transformed data to API...")
response = send_to_api(modified_file_name, base64_data)
# Save locally
save_clean_dataset(df, modified_file_name)
# Save transformation summary
summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
with open(summary_file, 'w') as f:
with open(summary_file, 'w', encoding='utf-8') as f:
f.write("TRANSFORMATION SUMMARY\n")
f.write("=" * 50 + "\n\n")
f.write(f"Original file: {excel_file_path}\n")
@@ -733,23 +768,34 @@ def main():
f.write("Final columns list:\n")
for col in df.columns:
f.write(f" - {col}\n")
f.write("\n" + "=" * 50 + "\n\n")
f.write("Key transformations applied:\n")
f.write(" - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n")
f.write(" - Converted DOB to Age (using TransactionDate as reference, not today's date)\n")
f.write(" - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n")
f.write(" - Dropped CreateDate (duplicate of TransactionDate)\n")
f.write(" - Dynamically cleaned Brand names using prefix matching\n")
f.write(" - Cleaned SiteName variations\n")
f.write(" - Merged contact method columns into single ContactMethod field\n")
f.write(" - Split date columns into Year, Month, TimeOfMonth, Day components\n")
f.write(" - Removed redundant columns (StoreId, Store, SiteType, etc.)\n")
print(f"\n📄 Transformation summary saved: {summary_file}")
print("\n" + "=" * 80)
if response and response.status_code == 200:
print("🎉 All transformations completed and file uploaded successfully! إن شاء الله")
print(f"{len(df)} rows processed")
print(f"{len(df.columns)} columns in final dataset")
print("Recurring customer flag added")
print("DOB converted to Age")
print(" ✅ Contact methods merged")
print("Date columns split into components")
print("SiteName and Brand cleaned")
else:
print("⚠️ Process completed but API upload may have failed.")
print(" 💡 Transformed file saved locally for inspection.")
print("🎉 All transformations completed successfully! إن شاء الله")
print(f"{len(df)} rows processed")
print(f"{len(df.columns)} columns in final dataset")
print("Recurring customer flag added")
print("DOB converted to Age (using transaction date)")
print("RegistrationDate converted to DaysSinceRegistration")
print(" ✅ CreateDate dropped (duplicate)")
print("Contact methods merged")
print("Date columns split into components")
print(" ✅ SiteName and Brand dynamically cleaned")
print("=" * 80)
if __name__ == "__main__":
main()
main()