#!/usr/bin/env python3

import csv
from collections import defaultdict
from datetime import datetime

def clean_amount(amount_str):
    """Convert amount string to float, handling negative amounts"""
    if not amount_str or amount_str == '':
        return 0.0
    
    amount_str = str(amount_str).replace(',', '').replace(' ', '').strip()
    
    if amount_str == '' or amount_str == '-' or amount_str == 'nan':
        return 0.0
    
    try:
        return abs(float(amount_str))  # Use absolute value for credit card negatives
    except:
        return 0.0

def categorize_expense_improved(vendor_name, amount=0, cardholder="", source=""):
    """Improved categorization to reduce 'Review Needed' items"""
    vendor = vendor_name.lower()
    cardholder = cardholder.lower()
    
    # Software/Technology - Atomic Elevator
    tech_keywords = ['apple', 'landbot', 'stamps.com', 'github', 'aws', 'microsoft', 
                     'adobe', 'dropbox', 'docusign', 'zapier', 'grammarly', 'linkedin',
                     'netflix', 'cleverbridg', 'avast', 'wisestamp']
    if any(x in vendor for x in tech_keywords):
        return 'Software/Technology', 'Atomic Elevator'
    
    # Marketing/Creative - Space Monkey Partners  
    marketing_keywords = ['canva', 'facebook', 'facebk', 'shutterstoc', 'constant contact']
    if any(x in vendor for x in marketing_keywords):
        return 'Marketing/Content', 'Space Monkey Partners'
    
    # Travel/Business - Shared
    travel_keywords = ['aviation', 'lowe aviation', 'airline', 'hotel', 'pollard', 'airbnb', 'uber', 'lyft']
    if any(x in vendor for x in travel_keywords):
        return 'Travel', 'Shared'
    
    # Food/Meals - Context-dependent
    food_keywords = ['bogarts', 'restaurant', 'cafe', 'starbucks', 'sq *', 'tst*']
    if any(x in vendor for x in food_keywords):
        if amount > 50:
            return 'Meals/Entertainment', 'Shared'  # Business meals
        else:
            return 'Meals/Entertainment', 'Personal'  # Personal meals
    
    # Personal services - Non-deductible
    personal_keywords = ['barber', 'salon', 'spa', 'gym', 'fitness']
    if any(x in vendor for x in personal_keywords):
        return 'Personal Services', 'Personal'
    
    # Bank/PayPal transfers and subscriptions
    if 'paypal inst xfer' in vendor:
        # Try to categorize PayPal transfers by what follows
        if any(x in vendor for x in ['spotify', 'netflix', 'hulu', 'disney', 'peacock']):
            return 'Entertainment/Personal', 'Personal'
        elif any(x in vendor for x in ['zapier', 'dropbox', 'docusign']):
            return 'Software/Technology', 'Atomic Elevator'
        elif 'shutterstoc' in vendor:
            return 'Marketing/Content', 'Space Monkey Partners'
        else:
            return 'Software/Technology', 'Review Needed'
    
    # Payroll and benefits
    if any(x in vendor for x in ['gusto', 'guideline']):
        return 'Payroll/Benefits', 'Shared'
    
    # Business credit card payments (Divvy)
    if 'divvy' in vendor:
        return 'Business Credit Card', 'Shared'
    
    # Insurance and utilities
    if any(x in vendor for x in ['insurance', 'spectrum', 'state farm']):
        return 'Office/Utilities', 'Shared'
    
    # Employee expenses (Alayna)
    if 'alayna' in cardholder:
        if any(x in vendor for x in ['canva', 'facebook']):
            return 'Marketing/Content', 'Space Monkey Partners'
        else:
            return 'Employee Expenses', 'Shared'
    
    # Default for bank transfers that we can't categorize
    if source == 'Bank Statement' and any(x in vendor for x in ['internet banking transfer', 'check']):
        return 'Internal Transfer', 'Internal'
    
    # Default
    return 'Other', 'Review Needed'

def process_corrected_credit_card(csv_file):
    """Process credit card with corrected column mapping"""
    transactions = []
    
    try:
        with open(csv_file, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            
            for row in reader:
                # CORRECTED mapping based on our discovery:
                # Amount is in "Clean Merchant Name" column
                # Merchant is in "Merchant" column
                date_str = row.get('Date (UTC)', '')
                amount = clean_amount(row.get('Clean Merchant Name', ''))
                merchant = row.get('Merchant', '')
                first_name = row.get('First Name', '')
                last_name = row.get('Last Name', '')
                cardholder = f"{first_name} {last_name}".strip()
                card_last_4 = row.get('Card Last 4', '').replace("'", "")
                
                # Skip invalid data
                if amount <= 0 or not merchant:
                    continue
                
                # Parse date
                try:
                    date = datetime.strptime(date_str.split()[0], '%Y-%m-%d')
                except:
                    continue
                
                # Filter to 2025-2026
                if date.year < 2025:
                    continue
                
                # Categorize
                category, business = categorize_expense_improved(merchant, amount, cardholder, 'Credit Card')
                
                transactions.append({
                    'date': date,
                    'amount': amount,
                    'vendor': merchant,
                    'cardholder': cardholder,
                    'category': category,
                    'business': business,
                    'source': f'Credit Card {card_last_4}',
                    'card_identifier': card_last_4
                })
                
    except Exception as e:
        print(f"ERROR processing {csv_file}: {e}")
        return []
    
    return transactions

def process_bank_statement_improved():
    """Process bank statement with improved categorization"""
    csv_file = '/home/ubuntu/.openclaw/media/inbound/file_293---5c15fc2d-5f20-4d35-972b-6da60ad73643.csv'
    transactions = []
    
    try:
        with open(csv_file, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            
            for row in reader:
                if not any(row.values()):
                    continue
                    
                date_str = row.get('Date', '')
                debit = clean_amount(row.get('Debit', ''))
                description = row.get('Description', '')
                
                try:
                    date = datetime.strptime(date_str, '%m/%d/%Y')
                except:
                    continue
                
                # Filter out credit card payments and internal transfers
                if (debit > 0 and 
                    not any(x in description.lower() for x in 
                    ['amex epayment', 'capital one crcardpmt', 'applecard gsbank', 
                     'jpmorgan chase', 'internet banking transfer', 'deposit', 
                     'mercury', 'cornerstone', 'space monkey par', 'atomic elevator',
                     'mobile capture', 'check #', 'divvy cred ewallet'])):
                    
                    # Improved categorization
                    category, business = categorize_expense_improved(description, debit, '', 'Bank Statement')
                    
                    # Only include actual business expenses, not internal transfers
                    if category != 'Internal Transfer':
                        transactions.append({
                            'date': date,
                            'amount': debit,
                            'vendor': description,
                            'cardholder': 'Bank Transfer',
                            'category': category,
                            'business': business,
                            'source': 'Bank Statement',
                            'card_identifier': 'Bank'
                        })
                        
    except Exception as e:
        print(f"ERROR processing bank statement: {e}")
        return []
    
    return transactions

def validate_data(transactions):
    """Validate the processed data for errors"""
    print("🔍 DATA VALIDATION:")
    
    # Check for zero amounts
    zero_amounts = [t for t in transactions if t['amount'] <= 0]
    if zero_amounts:
        print(f"  ❌ Found {len(zero_amounts)} transactions with zero/negative amounts")
    else:
        print(f"  ✅ No zero amount transactions")
    
    # Check for empty vendors
    empty_vendors = [t for t in transactions if not t['vendor']]
    if empty_vendors:
        print(f"  ❌ Found {len(empty_vendors)} transactions with empty vendors")
    else:
        print(f"  ✅ All transactions have vendor names")
    
    # Check date range
    dates = [t['date'] for t in transactions]
    if dates:
        min_date = min(dates)
        max_date = max(dates)
        print(f"  📅 Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")
    
    # Check for reasonable amounts (not over $50k per transaction)
    large_amounts = [t for t in transactions if t['amount'] > 50000]
    if large_amounts:
        print(f"  ⚠️  Found {len(large_amounts)} transactions over $50,000")
        for t in large_amounts[:3]:
            print(f"    - ${t['amount']:,.2f}: {t['vendor'][:50]}")
    
    print()

def main():
    print("🧹 CLEAN FINANCIAL ANALYSIS (Error-Free Version)")
    print("=" * 60)
    
    all_transactions = []
    
    # Process bank statement
    print("1. Processing bank statement...")
    bank_transactions = process_bank_statement_improved()
    all_transactions.extend(bank_transactions)
    print(f"   ✅ Bank: {len(bank_transactions)} transactions")
    
    # Process credit card 1
    print("2. Processing credit card 1...")
    cc1_transactions = process_corrected_credit_card('credit_card_1.csv')
    all_transactions.extend(cc1_transactions)
    print(f"   ✅ Credit Card 1: {len(cc1_transactions)} transactions")
    
    # Process credit card 2
    print("3. Processing credit card 2...")
    cc2_transactions = process_corrected_credit_card('credit_card_2.csv')
    all_transactions.extend(cc2_transactions)
    print(f"   ✅ Credit Card 2: {len(cc2_transactions)} transactions")
    
    # Validate data
    validate_data(all_transactions)
    
    # Calculate totals
    total_expenses = sum(t['amount'] for t in all_transactions)
    
    # Generate summaries
    source_totals = defaultdict(float)
    business_totals = defaultdict(float)
    category_totals = defaultdict(float)
    
    for t in all_transactions:
        source_totals[t['source']] += t['amount']
        business_totals[t['business']] += t['amount']
        category_totals[t['category']] += t['amount']
    
    print("📊 FINANCIAL SUMMARY")
    print("=" * 60)
    print(f"Total Expenses: ${total_expenses:,.2f}")
    print(f"Total Transactions: {len(all_transactions):,}")
    print()
    
    print("💳 BY SOURCE:")
    for source, amount in sorted(source_totals.items(), key=lambda x: x[1], reverse=True):
        pct = (amount / total_expenses) * 100 if total_expenses > 0 else 0
        print(f"  {source}: ${amount:,.2f} ({pct:.1f}%)")
    print()
    
    print("🏢 BY BUSINESS:")
    for business, amount in sorted(business_totals.items(), key=lambda x: x[1], reverse=True):
        pct = (amount / total_expenses) * 100 if total_expenses > 0 else 0
        print(f"  {business}: ${amount:,.2f} ({pct:.1f}%)")
    print()
    
    print("📂 BY CATEGORY:")
    for category, amount in sorted(category_totals.items(), key=lambda x: x[1], reverse=True):
        pct = (amount / total_expenses) * 100 if total_expenses > 0 else 0
        print(f"  {category}: ${amount:,.2f} ({pct:.1f}%)")
    print()
    
    # Tax calculation
    atomic_direct = business_totals.get('Atomic Elevator', 0)
    smp_direct = business_totals.get('Space Monkey Partners', 0)
    shared = business_totals.get('Shared', 0)
    personal = business_totals.get('Personal', 0)
    review_needed = business_totals.get('Review Needed', 0)
    
    atomic_total = atomic_direct + (shared * 0.5)
    smp_total = smp_direct + (shared * 0.5)
    
    print("🧾 TAX SUMMARY:")
    print(f"  Atomic Elevator: ${atomic_total:,.2f}")
    print(f"  Space Monkey Partners: ${smp_total:,.2f}")
    print(f"  Personal (non-deductible): ${personal:,.2f}")
    print(f"  Needs review: ${review_needed:,.2f}")
    print()
    
    # Export clean CSV
    csv_file = 'CLEAN_COMPLETE_EXPENSES_2025-2026.csv'
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['date', 'amount', 'vendor', 'cardholder', 'category', 'business', 'source']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        
        writer.writeheader()
        for t in sorted(all_transactions, key=lambda x: x['date'], reverse=True):
            writer.writerow({
                'date': t['date'].strftime('%Y-%m-%d'),
                'amount': f"{t['amount']:.2f}",
                'vendor': t['vendor'],
                'cardholder': t['cardholder'],
                'category': t['category'],
                'business': t['business'],
                'source': t['source']
            })
    
    print(f"📁 Clean data exported to: {csv_file}")
    
    return total_expenses, len(all_transactions)

if __name__ == "__main__":
    try:
        total, count = main()
        print(f"\n✅ SUCCESS: ${total:,.2f} total expenses, {count:,} transactions processed")
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()