#!/usr/bin/env python3

import csv
import pandas as pd
import re
from collections import defaultdict
from datetime import datetime
import os

def clean_amount(amount_str):
    """Convert amount string to float, handling various formats"""
    if not amount_str or amount_str == '':
        return 0.0
    
    # Remove commas and spaces, handle negative amounts
    amount_str = str(amount_str).replace(',', '').replace(' ', '').strip()
    
    # Handle empty or non-numeric values
    if amount_str == '' or amount_str == '-' or amount_str == 'nan':
        return 0.0
    
    try:
        return float(amount_str)
    except:
        return 0.0

def categorize_vendor(vendor_name, description=""):
    """Categorize vendors based on name and description"""
    vendor = vendor_name.lower()
    desc = description.lower()
    
    # Software/Technology
    if any(x in vendor for x in ['zapier', 'dropbox', 'docusign', 'netflix', 'microsoft', 'apple', 'linkedin', 'grammarly']):
        return 'Software/Technology', 'Atomic Elevator'
    
    # Marketing/Content
    if any(x in vendor for x in ['shutterstoc', 'canva', 'constant contact', 'mailchimp']):
        return 'Marketing/Content', 'Space Monkey Partners'
    
    # Entertainment/Personal
    if any(x in vendor for x in ['spotify', 'peacocktv', 'hulu', 'disney', 'netflix', 'paramount', 'dollarshave', 'headspace']):
        return 'Entertainment/Personal', 'Personal'
    
    # Travel/Accommodation
    if any(x in vendor for x in ['airbnb', 'uber', 'starbucks']):
        return 'Travel', 'Shared'
    
    # Insurance
    if any(x in vendor for x in ['insurance', 'state farm']):
        return 'Insurance', 'Shared'
    
    # Utilities/Infrastructure
    if any(x in vendor for x in ['spectrum', 'hostgator', 'networksolu']):
        return 'Office/Utilities', 'Shared'
    
    # Payroll/Employee
    if any(x in vendor for x in ['gusto', 'guideline']):
        return 'Payroll/Benefits', 'Shared'
    
    # Credit Card Payments (not expenses)
    if any(x in vendor for x in ['amex epayment', 'capital one', 'jpmorgan chase', 'applecard']):
        return 'Credit Card Payment', 'Transfer'
    
    # Business Revenue/Transfers
    if any(x in vendor for x in ['space monkey par', 'atomic elevator', 'cornerstone', 'deposit', 'mercury', 'red oxx']):
        return 'Business Revenue/Transfer', 'Transfer'
    
    # Medium subscription - could be business or personal
    if 'medium' in vendor:
        return 'Review Needed', 'Review Needed'
    
    # Samsung - could be business or personal
    if 'samsung' in vendor:
        return 'Review Needed', 'Review Needed'
    
    # Affirm payments
    if 'affirm' in vendor:
        return 'Financing', 'Review Needed'
    
    # Default
    return 'Other', 'Review Needed'

def process_bank_statement(csv_file):
    """Process the bank statement CSV file"""
    transactions = []
    
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        
        for row in reader:
            # Skip empty rows
            if not any(row.values()):
                continue
                
            # Extract key fields
            date_str = row.get('Date', '')
            debit = clean_amount(row.get('Debit', ''))
            credit = clean_amount(row.get('Credit', ''))
            description = row.get('Description', '')
            
            # Parse date
            try:
                date = datetime.strptime(date_str, '%m/%d/%Y')
            except:
                continue
            
            # Only process debits (expenses) for now
            if debit > 0:
                category, business = categorize_vendor(description, description)
                
                transactions.append({
                    'date': date,
                    'amount': debit,
                    'vendor': description,
                    'category': category,
                    'business': business,
                    'source': 'Bank Statement'
                })
    
    return transactions

def process_xlsx_files():
    """Process any Excel files that contain expense data"""
    # This would need to be customized based on the actual structure of the Excel files
    # For now, we'll return empty list and focus on the bank statement
    return []

def generate_summary_report(transactions):
    """Generate comprehensive expense summary"""
    
    # Filter out transfers and credit card payments
    expense_transactions = [t for t in transactions if t['category'] not in ['Credit Card Payment', 'Business Revenue/Transfer']]
    
    # Total expenses
    total_expenses = sum(t['amount'] for t in expense_transactions)
    
    # By category
    category_totals = defaultdict(float)
    for t in expense_transactions:
        category_totals[t['category']] += t['amount']
    
    # By business
    business_totals = defaultdict(float)
    for t in expense_transactions:
        business_totals[t['business']] += t['amount']
    
    # Generate report
    report = f"""
# COMPREHENSIVE EXPENSE ANALYSIS - 2025
## Keith Lauver Business Expenses

---

## 📊 FINANCIAL SUMMARY

**Total Business Expenses:** ${total_expenses:,.2f}

### BY CATEGORY:
"""
    
    for category, amount in sorted(category_totals.items(), key=lambda x: x[1], reverse=True):
        percentage = (amount / total_expenses) * 100 if total_expenses > 0 else 0
        report += f"- **{category}:** ${amount:,.2f} ({percentage:.1f}%)\n"
    
    report += f"\n### BY BUSINESS ENTITY:\n"
    
    for business, amount in sorted(business_totals.items(), key=lambda x: x[1], reverse=True):
        percentage = (amount / total_expenses) * 100 if total_expenses > 0 else 0
        report += f"- **{business}:** ${amount:,.2f} ({percentage:.1f}%)\n"
    
    # Top expenses by vendor
    vendor_totals = defaultdict(float)
    for t in expense_transactions:
        vendor_name = t['vendor'][:50] + "..." if len(t['vendor']) > 50 else t['vendor']
        vendor_totals[vendor_name] += t['amount']
    
    report += f"\n## 🔍 TOP EXPENSES BY VENDOR\n\n"
    
    for vendor, amount in sorted(vendor_totals.items(), key=lambda x: x[1], reverse=True)[:15]:
        report += f"**${amount:,.2f}** - {vendor}\n\n"
    
    # Items requiring review
    review_items = [t for t in expense_transactions if t['business'] == 'Review Needed']
    if review_items:
        review_total = sum(t['amount'] for t in review_items)
        report += f"\n## ⚠️ ITEMS REQUIRING CATEGORIZATION\n\n"
        report += f"**Total requiring review:** ${review_total:,.2f}\n\n"
        
        review_vendors = defaultdict(float)
        for t in review_items:
            review_vendors[t['vendor']] += t['amount']
        
        for vendor, amount in sorted(review_vendors.items(), key=lambda x: x[1], reverse=True):
            report += f"**${amount:,.2f}** - {vendor}\n\n"
    
    # Monthly breakdown
    monthly_totals = defaultdict(float)
    for t in expense_transactions:
        month_key = t['date'].strftime('%Y-%m')
        monthly_totals[month_key] += t['amount']
    
    report += f"\n## 📅 MONTHLY EXPENSE BREAKDOWN\n\n"
    
    for month, amount in sorted(monthly_totals.items()):
        month_name = datetime.strptime(month, '%Y-%m').strftime('%B %Y')
        report += f"**{month_name}:** ${amount:,.2f}\n\n"
    
    return report

def main():
    # Process bank statement
    bank_file = '/home/ubuntu/.openclaw/media/inbound/file_293---5c15fc2d-5f20-4d35-972b-6da60ad73643.csv'
    
    print("Processing bank statement...")
    transactions = process_bank_statement(bank_file)
    
    print(f"Processed {len(transactions)} transactions")
    
    # Generate report
    report = generate_summary_report(transactions)
    
    # Save report
    with open('Complete_Expense_Analysis_2025.md', 'w') as f:
        f.write(report)
    
    print("Analysis complete! Report saved to Complete_Expense_Analysis_2025.md")
    
    # Create CSV export for further analysis
    df = pd.DataFrame(transactions)
    df.to_csv('All_Expenses_2025_Complete.csv', index=False)
    
    print("CSV export saved to All_Expenses_2025_Complete.csv")

if __name__ == "__main__":
    main()