#!/usr/bin/env python3
"""
Script to download all PDFs from asset.quant-wiki.com links found in the quant-wiki documentation.
This script will:
1. Parse the grep output to extract PDF URLs and their corresponding file paths
2. Download each PDF to the appropriate directory
3. Handle errors and retries for failed downloads
"""

import os
import re
import subprocess
import time
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# Configuration
MAX_WORKERS = 5  # Number of concurrent downloads
RETRY_ATTEMPTS = 3  # Number of retry attempts for failed downloads
REQUEST_TIMEOUT = 30  # Request timeout in seconds

# Headers to mimic a browser request
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def extract_pdf_links_from_grep_output():
    """
    Extract PDF URLs and their corresponding file paths from the grep output.
    This function reads the actual grep output from the terminal command.
    """
    pdf_links = []
    
    # Run grep recursively over the docs tree to find all asset.quant-wiki.com references
    try:
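        # Roughly equivalent to running by hand:
        #   grep -nr asset.quant-wiki.com /home/quant_volumn/quant_common/quant-wiki/docs/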
        result = subprocess.run(
            ['grep', '-nr', 'asset.quant-wiki.com', '/home/quant_volumn/quant_common/quant-wiki/docs/'],
            capture_output=True, text=True, timeout=300
        )
        
        if result.returncode == 0:
            grep_output = result.stdout
        else:
            print(f"Grep command failed with return code {result.returncode}")
            print(f"Error: {result.stderr}")
            return pdf_links
            
    except subprocess.TimeoutExpired:
        print("Grep command timed out")
        return pdf_links
    except Exception as e:
        print(f"Error running grep command: {e}")
        return pdf_links
    
    # Parse the grep output to extract file paths and PDF URLs
    lines = grep_output.strip().split('\n')
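    # Each line has the form '<file path>:<line number>:<matched text>', e.g. (illustrative):
    #   /home/.../docs/strategy/momentum.md:12:[PDF](https://asset.quant-wiki.com/pdf/Some Paper.pdf)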

    for line in lines:
        if 'asset.quant-wiki.com' in line:
            # Extract the file path (everything before the first colon)
            file_path_match = re.match(r'^(.*?):', line)
            if not file_path_match:
                continue
            file_path = file_path_match.group(1)

            # Extract PDF URLs by matching from 'http(s)://' up to the first '.pdf', so
            # spaces and parentheses inside titles are kept. This captures full URLs like:
            # (https://asset.quant-wiki.com/pdf/Quantitative Trading_ How to Build Your Own Algorithmic Trading Business (Wiley Trading) (2021, Wiley).pdf)
            urls = re.findall(r'https?://.*?\.pdf', line, flags=re.IGNORECASE)
            for url in urls:
                # Strip any trailing markdown characters, just in case
                clean_link = re.sub(r'[\]\)\s]+$', '', url)
                pdf_links.append((file_path, clean_link))
    
    return pdf_links

def check_pdf_exists(output_path):
    """
    Check if PDF already exists in the target directory.
    Returns True if PDF exists, False otherwise.
    """
    if os.path.exists(output_path):
        return True
    
    # Also check for PDF files with similar names in the same directory
    pdf_dir = os.path.dirname(output_path)
    if os.path.exists(pdf_dir):
        for file in os.listdir(pdf_dir):
            if file.lower().endswith('.pdf'):
                # If there's already a PDF file in the directory, skip download
                return True
    
    return False

def download_pdf(url, output_path, retry_count=RETRY_ATTEMPTS):
    """
    Download a PDF from the given URL and save it to the output path.
    """
    # Skip if the target PDF (or another PDF in the same directory) is already present
    if check_pdf_exists(output_path):
        print(f"✓ Already present, skipping: {output_path}")
        return True
    
    for attempt in range(retry_count):
        try:
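            # stream=True avoids loading the whole response into memory; the body is
            # written to disk in 8 KB chunks below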
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, stream=True)
            response.raise_for_status()
            
            # Create directory if it doesn't exist
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            
            # Save the PDF
            with open(output_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            
            print(f"✓ Downloaded: {output_path}")
            return True
            
        except requests.exceptions.RequestException as e:
            print(f"✗ Attempt {attempt + 1} failed for {url}: {e} {output_path}")
            if attempt < retry_count - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
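                # (waits 1s, 2s, 4s, ... between successive attempts)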
            else:
                print(f"✗ Failed to download {url} after {retry_count} attempts")
                return False
    
    return False


def folder_has_any_pdf(directory):
    """
    Return True if the given directory contains any .pdf file (case-insensitive).
    """
    try:
        for entry in os.listdir(directory):
            if entry.lower().endswith('.pdf'):
                return True
    except OSError:
        # Missing or unreadable directory: treat as containing no PDFs
        return False

    return False

def get_pdf_filename_from_url(url):
    """
    Extract the PDF filename from the URL.
    """
    # Decode URL-encoded characters
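    # e.g. (illustrative) 'Some%20Paper%20%282021%29.pdf' -> 'Some Paper (2021).pdf'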
    decoded_url = urllib.parse.unquote(url)
    
    # Extract the filename part after the last slash
    filename = decoded_url.split('/')[-1]
    
    # Ensure it has a .pdf extension
    if not filename.lower().endswith('.pdf'):
        filename += '.pdf'
    
    return filename

def main():
    print("Starting PDF download process...")
    
    # Extract PDF links from grep output
    pdf_links = extract_pdf_links_from_grep_output()
    
    if not pdf_links:
        print("No PDF links found in grep output.")
        return
    
    print(f"Found {len(pdf_links)} PDF links to download.")
    
    # Prepare download tasks, de-duplicating repeated references to the same target file
    download_tasks = []
    seen_output_paths = set()
    
    for file_path, pdf_url in pdf_links:
        # Get the directory of the markdown file
        md_dir = os.path.dirname(file_path)
        
        # Get PDF filename from URL
        pdf_filename = get_pdf_filename_from_url(pdf_url)
        
        # Create output path in the same directory as the markdown file
        output_path = os.path.join(md_dir, pdf_filename)
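        # e.g. (illustrative) a link in docs/strategy/momentum.md to '.../pdf/Some Paper.pdf'
        # yields docs/strategy/Some Paper.pdf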
        # Skip downloading if the target directory already contains any PDF
        if folder_has_any_pdf(md_dir):
            print(f"Skipping '{md_dir}' — folder already contains a PDF.")
            continue

        # Skip duplicate references to the same target file to avoid concurrent
        # writes to the same path when several pages link the same PDF
        if output_path in seen_output_paths:
            continue
        seen_output_paths.add(output_path)

        download_tasks.append((pdf_url, output_path))
    
    # Download PDFs concurrently
    successful_downloads = 0
    failed_downloads = 0
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all download tasks
        future_to_task = {
            executor.submit(download_pdf, url, output_path): (url, output_path)
            for url, output_path in download_tasks
        }
        
        # Process completed tasks
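        # as_completed yields each future as soon as its download finishes,
        # regardless of submission order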
        for future in as_completed(future_to_task):
            url, output_path = future_to_task[future]
            try:
                success = future.result()
                if success:
                    successful_downloads += 1
                else:
                    failed_downloads += 1
            except Exception as e:
                print(f"✗ Unexpected error downloading {url}: {e}, {output_path}")
                failed_downloads += 1
    
    print("\nDownload summary:")
    print(f"Successful downloads: {successful_downloads}")
    print(f"Failed downloads: {failed_downloads}")
    print(f"Total attempted: {len(pdf_links)}")

if __name__ == "__main__":
    main()