#!/usr/bin/env python3
import re
import os
from typing import Tuple, List

def print_banner():
    """Show the application's title banner, followed by one blank line."""
    rule = "=" * 60
    banner_rows = (
        rule,
        "          ? URL CLEANER TOOL ?",
        "    Remove URLs and keep Email:Pass or Username:Pass",
        rule,
        "",  # trailing blank line separating the banner from the prompt
    )
    for row in banner_rows:
        print(row)

def print_separator():
    """Emit a 60-character horizontal rule made of dashes."""
    rule_width = 60
    print("-" * rule_width)

# Substrings that mark a colon-separated field as part of a URL rather than a
# username/email. They are matched against the lower-cased field, so the
# comparison is case-insensitive. ('http' also covers 'https'.)
_URL_MARKERS = (
    'http', 'www.', '.com/', '.net/', '.org/',
    'quillbot.com', 'legacy-word', 'app.', 'auth.',
    'mobile.', 'signup', 'login', 'recovery', 'settings',
    'upgrade', 'grammar-check', 'extension',
)


def clean_line(line: str) -> str:
    """
    Strip URL fragments from a line and return the ``user:password`` remainder.

    The line is split on ':' and scanned left to right. The first field that
    does not look like a URL component and qualifies as an email (contains
    '@' and is longer than 3 characters) or as a username (longer than 2
    characters, no slashes or backslashes) is taken as the user; everything
    after it, colons included, is the password.

    Args:
        line: Raw input line; leading/trailing whitespace is ignored.

    Returns:
        ``"user:password"``, or ``""`` when the line is blank, has no colon,
        no field qualifies as a user, or the password would be empty.
    """
    line = line.strip()
    if not line:
        return ""

    parts = line.split(':')
    if len(parts) < 2:
        return ""

    # The last field can only ever be (part of) a password, so it is never
    # considered as a user candidate.
    for index, candidate in enumerate(parts[:-1]):
        lowered = candidate.lower()
        if any(marker in lowered for marker in _URL_MARKERS):
            continue

        # Fields such as '//host/path' or '/path' are URL remnants.
        if candidate.startswith('/'):
            continue

        # Passwords may themselves contain ':', so re-join the remainder.
        password = ':'.join(parts[index + 1:])
        if not password:
            # A credential without a password is useless. Previously the
            # email branch skipped this check and returned "email:".
            continue

        looks_like_email = '@' in candidate and len(candidate) > 3
        looks_like_username = (
            len(candidate) > 2
            and '/' not in candidate
            and '\\' not in candidate
        )
        if looks_like_email or looks_like_username:
            return f"{candidate}:{password}"

    return ""

def is_valid_credential(line: str) -> bool:
    """
    Tell whether a line is a usable ``user:password`` pair.

    The line is split at its first ':'; both sides must be non-empty and the
    user side must not contain any URL-like or page-name fragment
    (case-insensitive).
    """
    if not line:
        return False

    user, colon, secret = line.partition(':')
    if not (colon and user and secret):
        return False

    # Fragments that indicate the "user" is really a leftover URL piece.
    blocked_fragments = (
        'http', 'https', 'www.', '.com/', '.net/', '.org/',
        'login', 'signup', 'recovery', 'settings', 'upgrade',
    )
    user_lower = user.lower()
    return not any(fragment in user_lower for fragment in blocked_fragments)

def categorize_credential(line: str) -> str:
    """
    Classify a credential line by the kind of identifier before the first ':'.

    Returns "email" when that identifier contains '@', "username" when it
    does not, and "unknown" when the line has no ':' at all.
    """
    identifier, colon, _ = line.partition(':')
    if not colon:
        return "unknown"
    return "email" if '@' in identifier else "username"

def process_file(filename: str) -> Tuple[int, int, int, int, List[str]]:
    """
    Read a file line by line and collect the credentials that survive cleaning.

    Each line goes through clean_line() and is kept only if the result is
    non-empty and passes is_valid_credential(). Undecodable bytes in the
    input are ignored. A progress message is printed every 10,000 lines.

    Returns: (total_lines, cleaned_lines, email_count, username_count, cleaned_data)
    Raises: FileNotFoundError if the path does not exist.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(f"File '{filename}' not found!")

    print(f"? Processing file: {filename}")
    print("⏳ Cleaning URLs and processing...")

    kept: List[str] = []
    lines_seen = 0  # stays 0 for an empty file

    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        for lines_seen, raw in enumerate(handle, 1):
            # Progress indicator for large files; '\r' rewrites the same line.
            if lines_seen % 10000 == 0:
                print(f"\r   ⏳ Processing line {lines_seen:,}...", end='', flush=True)

            candidate = clean_line(raw)
            if candidate and is_valid_credential(candidate):
                kept.append(candidate)

    # Blank out whatever the progress indicator left on the current line.
    print("\r" + " " * 50 + "\r", end='')

    # Everything that is not categorized as "email" counts as a username.
    emails = sum(1 for entry in kept if categorize_credential(entry) == "email")
    usernames = len(kept) - emails

    return lines_seen, len(kept), emails, usernames, kept

def save_cleaned_file(filename: str, cleaned_data: List[str]):
    """
    Overwrite the given file with the cleaned entries, one per line.

    Note that this replaces the original input file's contents.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(f"{entry}\n" for entry in cleaned_data)

    print(f"? Cleaned data saved back to: {filename}")

def print_statistics(total_lines: int, cleaned_lines: int, email_count: int, username_count: int):
    """
    Print the cleaning summary: line totals, credential breakdown and,
    when at least one line was processed, the retention percentage.
    """
    removed = total_lines - cleaned_lines

    print_separator()
    print("? CLEANING SUMMARY")
    print_separator()

    report_rows = [
        f"? Total lines processed:     {total_lines:,}",
        f"?️  Lines removed:            {removed:,}",
        f"✅ Lines remaining:           {cleaned_lines:,}",
        "",
        "? CREDENTIAL BREAKDOWN:",
        f"? Email:Password entries:   {email_count:,}",
        f"? Username:Password entries: {username_count:,}",
        f"? Total clean entries:      {cleaned_lines:,}",
    ]

    # The retention rate is undefined for an empty input, so skip it then.
    if total_lines > 0:
        retention_rate = cleaned_lines / total_lines * 100
        report_rows.append(f"? Data retention rate:      {retention_rate:.1f}%")

    for row in report_rows:
        print(row)

def main():
    """
    Interactive entry point: ask for a filename, clean that file in place,
    and print a summary. Errors are reported on stdout instead of raised.
    """
    print_banner()

    try:
        target = input("? Enter filename to clean: ").strip()

        if target:
            print()
            print_separator()

            # process_file returns four counters followed by the kept entries.
            *counters, entries = process_file(target)

            # The cleaned entries replace the original file's contents.
            save_cleaned_file(target, entries)

            print_statistics(*counters)

            print_separator()
            print("? Process completed successfully!")
            print("=" * 60)
        else:
            print("❌ No filename provided!")

    except FileNotFoundError as missing:
        print(f"❌ Error: {missing}")
    except KeyboardInterrupt:
        print("\n❌ Process interrupted by user!")
    except Exception as failure:
        print(f"❌ Unexpected error: {failure}")

# Run the interactive cleaner only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()