Update transform.py
هذا الالتزام موجود في:
408
transform.py
408
transform.py
@@ -4,15 +4,16 @@
|
||||
|
||||
# بسم الله الرحمن الرحيم
|
||||
|
||||
# بسم الله الرحمن الرحيم
|
||||
|
||||
import base64
|
||||
import json
|
||||
import requests
|
||||
import os
|
||||
import pandas as pd
|
||||
import os
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
import io
|
||||
import numpy as np
|
||||
import re
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
def sanitize_text(value):
|
||||
"""
|
||||
@@ -71,47 +72,99 @@ def clean_site_name(name):
|
||||
|
||||
return name.title()
|
||||
|
||||
def clean_brand(brand):
|
||||
def create_brand_mapping(series):
    """
    Build a mapping from lowercase brand spellings to a canonical display name.

    Brand values are converted to strings and stripped of surrounding
    whitespace; nulls, blank strings and "unknown" (any case) are ignored.
    The remaining names are grouped by a lowercase prefix: the first 5
    characters for names of 5+ characters, the whole name for names of 3 or 4
    characters. Names shorter than 3 characters are left unmapped.

    Within each group the most frequent spelling, title-cased, becomes the
    canonical name of every member; ties go to the spelling seen first.

    Args:
        series: pandas Series of raw brand values.

    Returns:
        dict mapping each lowercase brand spelling to its canonical name.
    """
    # Count occurrences of each *stripped* spelling once, up front. Counting
    # the stripped form keeps the frequencies aligned with the names being
    # grouped (e.g. "Adidas " and "Adidas" are the same spelling).
    counts = {}
    for raw in series.dropna():
        name = str(raw).strip()
        if name and name.lower() != "unknown":
            counts[name] = counts.get(name, 0) + 1

    # Group spellings by the longest available prefix (5, then 4, then 3).
    groups = {}
    for name in counts:
        lowered = name.lower()
        for prefix_len in (5, 4, 3):
            if len(lowered) >= prefix_len:
                groups.setdefault(lowered[:prefix_len], []).append(name)
                break

    # Map every member of a group to the group's most frequent spelling.
    # NOTE: no separate substring pass is needed - every name of 3+ characters
    # already receives an entry here.
    brand_map = {}
    for members in groups.values():
        canonical = max(members, key=counts.__getitem__).title()
        for name in members:
            brand_map[name.lower()] = canonical

    return brand_map
|
||||
|
||||
def clean_brand_dynamic(brand, brand_mapping):
    """
    Normalize a single brand value using a precomputed brand mapping.

    Args:
        brand: raw brand value (may be null, blank, or a non-string).
        brand_mapping: dict of lowercase brand spelling -> canonical name.

    Returns:
        "Unknown" for null, empty or whitespace-only input; otherwise the
        mapped canonical name when the lowercase value (or, failing that, its
        first three characters) matches a mapping key; otherwise the stripped
        value in title case.
    """
    if pd.isna(brand):
        return "Unknown"

    brand_str = str(brand).strip()
    if not brand_str:
        # Whitespace-only values carry no brand information.
        return "Unknown"

    brand_lower = brand_str.lower()

    # Exact match on the normalized spelling.
    if brand_lower in brand_mapping:
        return brand_mapping[brand_lower]

    # Fallback: first mapping key sharing the same 3-character prefix.
    if len(brand_lower) >= 3:
        head = brand_lower[:3]
        for key, value in brand_mapping.items():
            if key.startswith(head):
                return value

    # If not found, return title case.
    return brand_str.title()
|
||||
|
||||
def calculate_age_from_dob(dob_value):
|
||||
def calculate_age_from_dob(dob_value, transaction_date):
|
||||
"""
|
||||
Convert DOB to age, handle 1900-01-01 as Unknown
|
||||
Convert DOB to age based on transaction date, not today's date
|
||||
Handles 1900-01-01 as Unknown
|
||||
"""
|
||||
if pd.isna(dob_value) or dob_value == "":
|
||||
return "Unknown"
|
||||
|
||||
if pd.isna(transaction_date) or transaction_date == "":
|
||||
return "Unknown"
|
||||
|
||||
dob_str = str(dob_value).strip()
|
||||
|
||||
# Check for the placeholder date
|
||||
@@ -119,16 +172,25 @@ def calculate_age_from_dob(dob_value):
|
||||
return "Unknown"
|
||||
|
||||
try:
|
||||
# Try to parse the date
|
||||
# Parse DOB
|
||||
if '-' in dob_str:
|
||||
dob = pd.to_datetime(dob_str.split()[0]) # Handle datetime strings
|
||||
dob = pd.to_datetime(dob_str.split()[0])
|
||||
elif '/' in dob_str:
|
||||
dob = pd.to_datetime(dob_str)
|
||||
else:
|
||||
return "Unknown"
|
||||
|
||||
today = datetime.now()
|
||||
age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
|
||||
# Parse Transaction Date
|
||||
trans_date_str = str(transaction_date).strip()
|
||||
if '-' in trans_date_str:
|
||||
trans_date = pd.to_datetime(trans_date_str.split()[0])
|
||||
elif '/' in trans_date_str:
|
||||
trans_date = pd.to_datetime(trans_date_str)
|
||||
else:
|
||||
return "Unknown"
|
||||
|
||||
# Calculate age at time of transaction
|
||||
age = trans_date.year - dob.year - ((trans_date.month, trans_date.day) < (dob.month, dob.day))
|
||||
|
||||
if age < 0 or age > 120: # Sanity check
|
||||
return "Unknown"
|
||||
@@ -137,6 +199,48 @@ def calculate_age_from_dob(dob_value):
|
||||
except:
|
||||
return "Unknown"
|
||||
|
||||
def _parse_date_value(value):
    """Parse a '-' or '/' separated date value; return None for any other format."""
    text = str(value).strip()
    if '-' in text:
        # Keep only the first whitespace-separated token, which drops a
        # trailing time component such as "2020-01-01 00:00:00".
        return pd.to_datetime(text.split()[0])
    if '/' in text:
        return pd.to_datetime(text)
    return None


def calculate_registration_duration(registration_date, transaction_date):
    """
    Calculate number of days between registration and transaction.

    Args:
        registration_date: registration date value (string or date-like).
        transaction_date: transaction date value (string or date-like).

    Returns:
        - "Unknown" when either value is null, empty, in an unsupported
          format, or cannot be parsed;
        - 0 when the transaction precedes the registration;
        - "3650+" when the gap exceeds 3650 days (10-year sanity cap);
        - otherwise the whole number of days as an int.
    """
    if pd.isna(registration_date) or registration_date == "":
        return "Unknown"

    if pd.isna(transaction_date) or transaction_date == "":
        return "Unknown"

    try:
        reg_date = _parse_date_value(registration_date)
        trans_date = _parse_date_value(transaction_date)

        if reg_date is None or trans_date is None:
            return "Unknown"
        if pd.isna(reg_date) or pd.isna(trans_date):
            # A NaT result would otherwise propagate as a NaN day count.
            return "Unknown"

        days_diff = (trans_date - reg_date).days
    except Exception:
        # Best-effort row-level parsing: any parse/arithmetic failure maps to
        # "Unknown" (without swallowing KeyboardInterrupt/SystemExit).
        return "Unknown"

    if days_diff < 0:
        # Transaction before registration - treat as 0. Returned as an int so
        # it is the same value as a genuine zero-day gap.
        return 0

    if days_diff > 3650:  # Cap at 10 years (sanity check)
        return "3650+"

    return days_diff
|
||||
|
||||
def merge_contact_methods(row):
|
||||
"""
|
||||
Merge Email, SMS, Mail, Phone into one column with priority order
|
||||
@@ -157,7 +261,7 @@ def merge_contact_methods(row):
|
||||
|
||||
return ','.join(contact_methods) # Return all methods as comma-separated
|
||||
|
||||
def extract_date_components(date_value, column_name):
|
||||
def extract_date_components(date_value, column_name, reference_date=None):
|
||||
"""
|
||||
Extract Year, Month, TimeOfMonth, Day from date
|
||||
"""
|
||||
@@ -269,15 +373,21 @@ def transform_dataframe(df):
|
||||
if 'Gender' in df.columns:
|
||||
print(" ✅ Keeping 'Gender'")
|
||||
|
||||
# G > Convert DOB to Age
|
||||
if 'DOB' in df.columns:
|
||||
df['Age'] = df['DOB'].apply(calculate_age_from_dob)
|
||||
# G > Convert DOB to Age (using TransactionDate as reference)
|
||||
if 'DOB' in df.columns and 'TransactionDate' in df.columns:
|
||||
df['Age'] = df.apply(lambda row: calculate_age_from_dob(row['DOB'], row['TransactionDate']), axis=1)
|
||||
df = df.drop(columns=['DOB'])
|
||||
print(" ✅ Converted 'DOB' to 'Age' (1900-01-01 → Unknown)")
|
||||
print(" ✅ Converted 'DOB' to 'Age' (using TransactionDate as reference)")
|
||||
elif 'DOB' in df.columns:
|
||||
print(" ⚠️ 'DOB' found but 'TransactionDate' missing - cannot calculate Age properly")
|
||||
|
||||
# H > Keep RegistrationDate
|
||||
if 'RegistrationDate' in df.columns:
|
||||
print(" ✅ Keeping 'RegistrationDate'")
|
||||
# H > Convert RegistrationDate to duration (days since registration)
|
||||
if 'RegistrationDate' in df.columns and 'TransactionDate' in df.columns:
|
||||
df['DaysSinceRegistration'] = df.apply(lambda row: calculate_registration_duration(row['RegistrationDate'], row['TransactionDate']), axis=1)
|
||||
df = df.drop(columns=['RegistrationDate'])
|
||||
print(" ✅ Converted 'RegistrationDate' to 'DaysSinceRegistration' (days between registration and transaction)")
|
||||
elif 'RegistrationDate' in df.columns:
|
||||
print(" ⚠️ 'RegistrationDate' found but 'TransactionDate' missing - keeping as-is")
|
||||
|
||||
# I > Drop FirstLoginDate
|
||||
if 'FirstLoginDate' in df.columns:
|
||||
@@ -370,19 +480,18 @@ def transform_dataframe(df):
|
||||
if 'Tier' in df.columns:
|
||||
print(" ✅ Keeping 'Tier'")
|
||||
|
||||
# AE, AF > Merge TransactionDate and CreateDate into date components
|
||||
date_columns_to_process = []
|
||||
# AE > Convert TransactionDate into date components
|
||||
if 'TransactionDate' in df.columns:
|
||||
date_columns_to_process.append(('TransactionDate', 'Transaction'))
|
||||
if 'CreateDate' in df.columns:
|
||||
date_columns_to_process.append(('CreateDate', 'Create'))
|
||||
|
||||
for date_col, prefix in date_columns_to_process:
|
||||
date_components = df[date_col].apply(lambda x: extract_date_components(x, prefix))
|
||||
date_components = df['TransactionDate'].apply(lambda x: extract_date_components(x, 'Transaction'))
|
||||
date_df = pd.DataFrame(date_components.tolist())
|
||||
df = pd.concat([df, date_df], axis=1)
|
||||
df = df.drop(columns=[date_col])
|
||||
print(f" ✅ Converted '{date_col}' into 4 columns ({prefix}_Year, {prefix}_Month, {prefix}_TimeOfMonth, {prefix}_Day)")
|
||||
df = df.drop(columns=['TransactionDate'])
|
||||
print(" ✅ Converted 'TransactionDate' into 4 columns (Transaction_Year, Transaction_Month, Transaction_TimeOfMonth, Transaction_Day)")
|
||||
|
||||
# AF > Drop CreateDate (as requested - it's the same as TransactionDate)
|
||||
if 'CreateDate' in df.columns:
|
||||
df = df.drop(columns=['CreateDate'])
|
||||
print(" 🗑️ Dropped 'CreateDate' (duplicate of TransactionDate)")
|
||||
|
||||
# AG > Drop MemberId
|
||||
if 'MemberId' in df.columns:
|
||||
@@ -394,135 +503,128 @@ def transform_dataframe(df):
|
||||
df = df.drop(columns=['SiteId'])
|
||||
print(" 🗑️ Dropped 'SiteId'")
|
||||
|
||||
# AI > Drop ParentSiteId
|
||||
if 'ParentSiteId' in df.columns:
|
||||
df = df.drop(columns=['ParentSiteId'])
|
||||
print(" 🗑️ Dropped 'ParentSiteId'")
|
||||
|
||||
# AJ > Keep and clean SiteName
|
||||
# AI > Clean and keep SiteName
|
||||
if 'SiteName' in df.columns:
|
||||
df['SiteName'] = df['SiteName'].apply(clean_site_name)
|
||||
print(" ✅ Kept and cleaned 'SiteName'")
|
||||
|
||||
# AK > Drop SiteType
|
||||
if 'SiteType' in df.columns:
|
||||
df = df.drop(columns=['SiteType'])
|
||||
print(" 🗑️ Dropped 'SiteType'")
|
||||
|
||||
# AL > Keep Quantity
|
||||
# AJ > Keep Quantity
|
||||
if 'Quantity' in df.columns:
|
||||
print(" ✅ Keeping 'Quantity'")
|
||||
|
||||
# AM > Keep Amount
|
||||
# AK > Keep Amount
|
||||
if 'Amount' in df.columns:
|
||||
print(" ✅ Keeping 'Amount'")
|
||||
|
||||
# AN > Drop RewardType
|
||||
# AL > Drop RewardType
|
||||
if 'RewardType' in df.columns:
|
||||
df = df.drop(columns=['RewardType'])
|
||||
print(" 🗑️ Dropped 'RewardType'")
|
||||
|
||||
# AO > Keep Points
|
||||
# AM > Keep Points
|
||||
if 'Points' in df.columns:
|
||||
print(" ✅ Keeping 'Points'")
|
||||
|
||||
# AP > Drop trxDetailId
|
||||
# AN > Drop trxDetailId
|
||||
if 'trxDetailId' in df.columns:
|
||||
df = df.drop(columns=['trxDetailId'])
|
||||
print(" 🗑️ Dropped 'trxDetailId'")
|
||||
|
||||
# AQ > Drop TrxId
|
||||
# AO > Drop TrxId
|
||||
if 'TrxId' in df.columns:
|
||||
df = df.drop(columns=['TrxId'])
|
||||
print(" 🗑️ Dropped 'TrxId'")
|
||||
|
||||
# AR > Drop TransactionStatusId
|
||||
# AP > Drop TransactionStatusId
|
||||
if 'TransactionStatusId' in df.columns:
|
||||
df = df.drop(columns=['TransactionStatusId'])
|
||||
print(" 🗑️ Dropped 'TransactionStatusId'")
|
||||
|
||||
# AS > Keep TransactionStatusName
|
||||
# AQ > Keep TransactionStatusName
|
||||
if 'TransactionStatusName' in df.columns:
|
||||
print(" ✅ Keeping 'TransactionStatusName'")
|
||||
|
||||
# AT > Drop TransactionTypeId
|
||||
# AR > Drop TransactionTypeId
|
||||
if 'TransactionTypeId' in df.columns:
|
||||
df = df.drop(columns=['TransactionTypeId'])
|
||||
print(" 🗑️ Dropped 'TransactionTypeId'")
|
||||
|
||||
# AU > Keep TransactionTypeName
|
||||
# AS > Keep TransactionTypeName
|
||||
if 'TransactionTypeName' in df.columns:
|
||||
print(" ✅ Keeping 'TransactionTypeName'")
|
||||
|
||||
# AV > Drop Reportable
|
||||
# AT > Drop Reportable
|
||||
if 'Reportable' in df.columns:
|
||||
df = df.drop(columns=['Reportable'])
|
||||
print(" 🗑️ Dropped 'Reportable'")
|
||||
|
||||
# AW > Keep TransactionItemCode
|
||||
# AU > Keep TransactionItemCode
|
||||
if 'TransactionItemCode' in df.columns:
|
||||
print(" ✅ Keeping 'TransactionItemCode'")
|
||||
|
||||
# AX > Keep AnalysisCode1
|
||||
# AV > Keep AnalysisCode1
|
||||
if 'AnalysisCode1' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode1'")
|
||||
|
||||
# AY > Keep AnalysisCode2
|
||||
# AW > Keep AnalysisCode2
|
||||
if 'AnalysisCode2' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode2'")
|
||||
|
||||
# AZ > Keep AnalysisCode3
|
||||
# AX > Keep AnalysisCode3
|
||||
if 'AnalysisCode3' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode3'")
|
||||
|
||||
# BA > Keep AnalysisCode4
|
||||
# AY > Keep AnalysisCode4
|
||||
if 'AnalysisCode4' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode4'")
|
||||
|
||||
# BB > Keep and clean Brand
|
||||
# AZ > Dynamically clean Brand
|
||||
if 'Brand' in df.columns:
|
||||
df['Brand'] = df['Brand'].apply(clean_brand)
|
||||
print(" ✅ Kept and cleaned 'Brand'")
|
||||
print(" 🔍 Analyzing unique brand names to create dynamic mapping...")
|
||||
brand_mapping = create_brand_mapping(df['Brand'])
|
||||
print(f" 📊 Created mapping for {len(brand_mapping)} unique brand variations")
|
||||
df['Brand'] = df['Brand'].apply(lambda x: clean_brand_dynamic(x, brand_mapping))
|
||||
print(" ✅ Kept and dynamically cleaned 'Brand'")
|
||||
|
||||
# BC > Keep AnalysisCode6
|
||||
# BA > Keep AnalysisCode6
|
||||
if 'AnalysisCode6' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode6'")
|
||||
|
||||
# BD > Keep AnalysisCode7
|
||||
# BB > Keep AnalysisCode7
|
||||
if 'AnalysisCode7' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode7'")
|
||||
|
||||
# BE > Keep AnalysisCode8
|
||||
# BC > Keep AnalysisCode8
|
||||
if 'AnalysisCode8' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode8'")
|
||||
|
||||
# BF > Keep Price
|
||||
# BD > Keep Price
|
||||
if 'Price' in df.columns:
|
||||
print(" ✅ Keeping 'Price'")
|
||||
|
||||
# BG > Keep AnalysisCode10
|
||||
# BE > Keep AnalysisCode10
|
||||
if 'AnalysisCode10' in df.columns:
|
||||
print(" ✅ Keeping 'AnalysisCode10'")
|
||||
|
||||
# BH > Keep InvalidReason
|
||||
# BF > Keep InvalidReason
|
||||
if 'InvalidReason' in df.columns:
|
||||
print(" ✅ Keeping 'InvalidReason'")
|
||||
|
||||
# BI > Drop Description
|
||||
# BG > Drop Description
|
||||
if 'Description' in df.columns:
|
||||
df = df.drop(columns=['Description'])
|
||||
print(" 🗑️ Dropped 'Description'")
|
||||
|
||||
# BJ > Drop PromotionId
|
||||
# BH > Drop PromotionId
|
||||
if 'PromotionId' in df.columns:
|
||||
df = df.drop(columns=['PromotionId'])
|
||||
print(" 🗑️ Dropped 'PromotionId'")
|
||||
|
||||
# BK > Keep PromotionName
|
||||
# BI > Keep PromotionName
|
||||
if 'PromotionName' in df.columns:
|
||||
print(" ✅ Keeping 'PromotionName'")
|
||||
|
||||
# BL > Convert PromotionStartDate into 4 columns
|
||||
# BJ > Convert PromotionStartDate into 4 columns
|
||||
if 'PromotionStartDate' in df.columns:
|
||||
date_components = df['PromotionStartDate'].apply(lambda x: extract_date_components(x, 'PromotionStart'))
|
||||
date_df = pd.DataFrame(date_components.tolist())
|
||||
@@ -530,32 +632,32 @@ def transform_dataframe(df):
|
||||
df = df.drop(columns=['PromotionStartDate'])
|
||||
print(" ✅ Converted 'PromotionStartDate' into 4 columns (PromotionStart_Year, PromotionStart_Month, PromotionStart_TimeOfMonth, PromotionStart_Day)")
|
||||
|
||||
# BM > Drop PromotionEndDate
|
||||
# BK > Drop PromotionEndDate
|
||||
if 'PromotionEndDate' in df.columns:
|
||||
df = df.drop(columns=['PromotionEndDate'])
|
||||
print(" 🗑️ Dropped 'PromotionEndDate'")
|
||||
|
||||
# BN > Drop PromotionOfferTypeId
|
||||
# BL > Drop PromotionOfferTypeId
|
||||
if 'PromotionOfferTypeId' in df.columns:
|
||||
df = df.drop(columns=['PromotionOfferTypeId'])
|
||||
print(" 🗑️ Dropped 'PromotionOfferTypeId'")
|
||||
|
||||
# BO > Drop PromotionOfferTypeName
|
||||
# BM > Drop PromotionOfferTypeName
|
||||
if 'PromotionOfferTypeName' in df.columns:
|
||||
df = df.drop(columns=['PromotionOfferTypeName'])
|
||||
print(" 🗑️ Dropped 'PromotionOfferTypeName'")
|
||||
|
||||
# BP > Drop PromotionSiteId
|
||||
# BN > Drop PromotionSiteId
|
||||
if 'PromotionSiteId' in df.columns:
|
||||
df = df.drop(columns=['PromotionSiteId'])
|
||||
print(" 🗑️ Dropped 'PromotionSiteId'")
|
||||
|
||||
# BQ > Drop PromotionSite
|
||||
# BO > Drop PromotionSite
|
||||
if 'PromotionSite' in df.columns:
|
||||
df = df.drop(columns=['PromotionSite'])
|
||||
print(" 🗑️ Dropped 'PromotionSite'")
|
||||
|
||||
# BR > Drop QualifyingProductQuantity
|
||||
# BP > Drop QualifyingProductQuantity
|
||||
if 'QualifyingProductQuantity' in df.columns:
|
||||
df = df.drop(columns=['QualifyingProductQuantity'])
|
||||
print(" 🗑️ Dropped 'QualifyingProductQuantity'")
|
||||
@@ -619,64 +721,6 @@ def read_and_process_file(file_path, max_rows=5000):
|
||||
traceback.print_exc()
|
||||
return None, None, None
|
||||
|
||||
def encode_file_to_base64(file_content):
    """
    Encode file content to a base64 string.

    Args:
        file_content: bytes-like object, or a str (encoded as UTF-8 first).

    Returns:
        The base64 encoding of the content as an ASCII str.

    Raises:
        TypeError: if the content is neither a str nor a bytes-like object.
    """
    if isinstance(file_content, str):
        # Text must be turned into bytes before base64 encoding.
        file_content = file_content.encode('utf-8')

    try:
        return base64.b64encode(file_content).decode('ascii')
    except TypeError as e:
        # Report and re-raise instead of dropping bytes >= 128, which would
        # silently corrupt the payload.
        print(f"❌ Error encoding to base64: {e}")
        raise
|
||||
|
||||
def send_to_api(
    file_name,
    base64_data,
    api_url="https://problab-api-0004c00ee319.hosted.ghaymah.systems/process_dataset",
    timeout=60,
):
    """
    Send the encoded file data to the API.

    Args:
        file_name: name reported to the API for the uploaded file.
        base64_data: base64-encoded file content (str).
        api_url: endpoint that receives the POST request.
        timeout: request timeout in seconds, passed to ``requests.post``.

    Returns:
        The ``requests`` response object (whatever its status code), or None
        if the request could not be built or sent.
    """
    payload = {
        "event": {
            "data": {
                "new": {
                    "id": "snipp_transformed",
                    "file_data": base64_data,
                    "file_name": file_name,
                    "hasHeader": True,
                    "delimiter": ","
                }
            }
        }
    }

    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Data-Transformer/1.0',
        'Accept': 'application/json'
    }

    print(f"\n🔄 Sending transformed file '{file_name}' to API...")
    print(f"📊 Base64 data size: {len(base64_data)} characters")

    try:
        response = requests.post(api_url, json=payload, headers=headers, timeout=timeout)
    except (requests.exceptions.RequestException, TypeError, ValueError) as e:
        # RequestException covers transport failures; TypeError/ValueError
        # cover a payload that cannot be serialized to JSON.
        print(f"❌ Error occurred while sending to API: {e}")
        return None

    if response.status_code == 200:
        print("✅ File sent successfully!")
        print(f"📋 Response status: {response.status_code}")
    else:
        print(f"❌ Failed to send file. Status code: {response.status_code}")
        # Only the first 500 characters of the body are shown.
        print(f"📋 Response: {response.text[:500]}")

    return response
|
||||
|
||||
def save_clean_dataset(df, file_name):
|
||||
"""
|
||||
Save the transformed dataset locally
|
||||
@@ -693,10 +737,10 @@ def save_clean_dataset(df, file_name):
|
||||
|
||||
def main():
|
||||
"""
|
||||
Main function to execute all transformations and upload
|
||||
Main function to execute all transformations
|
||||
"""
|
||||
print("=" * 80)
|
||||
print("🚢 Ship Performance Dataset - Complete Transformation & Upload")
|
||||
print("🚢 Ship Performance Dataset - Complete Transformation")
|
||||
print("=" * 80)
|
||||
|
||||
# Specify the path to your Excel file
|
||||
@@ -710,21 +754,12 @@ def main():
|
||||
print("\n❌ Process failed. Please check if the file exists.")
|
||||
return
|
||||
|
||||
# Encode to base64
|
||||
print("\n2️⃣ Encoding transformed file to base64...")
|
||||
base64_data = encode_file_to_base64(file_content)
|
||||
print(f" ✅ Encoding complete ({len(base64_data)} characters)")
|
||||
|
||||
# Send to API
|
||||
print("\n3️⃣ Sending transformed data to API...")
|
||||
response = send_to_api(modified_file_name, base64_data)
|
||||
|
||||
# Save locally
|
||||
save_clean_dataset(df, modified_file_name)
|
||||
|
||||
# Save transformation summary
|
||||
summary_file = f'transformation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
|
||||
with open(summary_file, 'w') as f:
|
||||
with open(summary_file, 'w', encoding='utf-8') as f:
|
||||
f.write("TRANSFORMATION SUMMARY\n")
|
||||
f.write("=" * 50 + "\n\n")
|
||||
f.write(f"Original file: {excel_file_path}\n")
|
||||
@@ -733,23 +768,34 @@ def main():
|
||||
f.write("Final columns list:\n")
|
||||
for col in df.columns:
|
||||
f.write(f" - {col}\n")
|
||||
f.write("\n" + "=" * 50 + "\n\n")
|
||||
f.write("Key transformations applied:\n")
|
||||
f.write(" - Added IsRecurringCustomer flag (based on multiple transactions per Userid)\n")
|
||||
f.write(" - Converted DOB to Age (using TransactionDate as reference, not today's date)\n")
|
||||
f.write(" - Converted RegistrationDate to DaysSinceRegistration (days between registration and transaction)\n")
|
||||
f.write(" - Dropped CreateDate (duplicate of TransactionDate)\n")
|
||||
f.write(" - Dynamically cleaned Brand names using prefix matching\n")
|
||||
f.write(" - Cleaned SiteName variations\n")
|
||||
f.write(" - Merged contact method columns into single ContactMethod field\n")
|
||||
f.write(" - Split date columns into Year, Month, TimeOfMonth, Day components\n")
|
||||
f.write(" - Removed redundant columns (StoreId, Store, SiteType, etc.)\n")
|
||||
|
||||
print(f"\n📄 Transformation summary saved: {summary_file}")
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
if response and response.status_code == 200:
|
||||
print("🎉 All transformations completed and file uploaded successfully! إن شاء الله")
|
||||
print(f" ✅ {len(df)} rows processed")
|
||||
print(f" ✅ {len(df.columns)} columns in final dataset")
|
||||
print(" ✅ Recurring customer flag added")
|
||||
print(" ✅ DOB converted to Age")
|
||||
print(" ✅ Contact methods merged")
|
||||
print(" ✅ Date columns split into components")
|
||||
print(" ✅ SiteName and Brand cleaned")
|
||||
else:
|
||||
print("⚠️ Process completed but API upload may have failed.")
|
||||
print(" 💡 Transformed file saved locally for inspection.")
|
||||
print("🎉 All transformations completed successfully! إن شاء الله")
|
||||
print(f" ✅ {len(df)} rows processed")
|
||||
print(f" ✅ {len(df.columns)} columns in final dataset")
|
||||
print(" ✅ Recurring customer flag added")
|
||||
print(" ✅ DOB converted to Age (using transaction date)")
|
||||
print(" ✅ RegistrationDate converted to DaysSinceRegistration")
|
||||
print(" ✅ CreateDate dropped (duplicate)")
|
||||
print(" ✅ Contact methods merged")
|
||||
print(" ✅ Date columns split into components")
|
||||
print(" ✅ SiteName and Brand dynamically cleaned")
|
||||
print("=" * 80)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
|
||||
المرجع في مشكلة جديدة
حظر مستخدم