# process_remaining.py

import os
import gc
import json
import time
import logging
import sqlite3
import psutil
import pdfplumber
import requests
import tempfile
import re
from typing import List, Dict, Any, Tuple, Optional

# Logging setup: INFO-and-above records are written both to 'processing.log'
# and to a stream handler (stderr by default), each line carrying the
# timestamp, logger name and severity.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_LOG_HANDLERS = [
    logging.FileHandler('processing.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    format=_LOG_FORMAT,
    handlers=_LOG_HANDLERS,
    level=logging.INFO,
)
logger = logging.getLogger('arxiv_processor')

def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in GB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # Bytes -> KB -> MB -> GB.
    return rss_bytes / 1024 / 1024 / 1024

def save_checkpoint(current_index: int):
    """Persist progress to ``data/local_checkpoint.json``.

    The checkpoint is first written to a sibling temporary file and then
    moved over the real file with ``os.replace``. Because the checkpoint is
    rewritten after every paper, overwriting it in place would leave a
    truncated, unparseable file if the process were interrupted mid-write;
    with the rename, readers see either the previous or the new checkpoint.

    Args:
        current_index: Running count of papers handled so far; stored under
            the ``paper_index`` key alongside a local-time ``timestamp``.
    """
    checkpoint_path = 'data/local_checkpoint.json'
    temp_path = checkpoint_path + '.tmp'

    checkpoint = {
        'paper_index': current_index,
        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
    }
    with open(temp_path, 'w') as f:
        json.dump(checkpoint, f)
    # Swap the fully written file into place in a single step.
    os.replace(temp_path, checkpoint_path)

    logger.info(f"Saved checkpoint: Paper {current_index}")

def load_checkpoint() -> Optional[Dict[str, Any]]:
    """Load progress from ``data/local_checkpoint.json``.

    Returns:
        The parsed checkpoint (as written by ``save_checkpoint``: a dict with
        ``paper_index`` and ``timestamp``), or ``None`` when the file does not
        exist or cannot be parsed.

    A corrupt or truncated checkpoint is treated like a missing one (with a
    warning) instead of aborting start-up: the checkpoint only carries a
    progress counter, so starting without it is safe.
    """
    try:
        with open('data/local_checkpoint.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Same named logger the rest of this module uses.
        logging.getLogger('arxiv_processor').warning(
            f"Ignoring unreadable checkpoint file: {e}")
        return None

def get_processing_stats() -> Dict[str, int]:
    """Return row counts describing processing progress in ``data/papers.db``.

    Returns:
        A dict with the keys ``total`` (all rows), ``processed``
        (``processed = 1``), ``failed`` (``processed = -1``), ``with_emails``
        (non-empty ``emails``; NULL values are not counted) and ``remaining``
        (``total - processed - failed``).

    Raises:
        sqlite3.Error: If the database or the ``papers`` table is unavailable.
    """
    count_queries = {
        'total': "SELECT COUNT(*) FROM papers",
        'processed': "SELECT COUNT(*) FROM papers WHERE processed = 1",
        'failed': "SELECT COUNT(*) FROM papers WHERE processed = -1",
        'with_emails': "SELECT COUNT(*) FROM papers WHERE emails != ''",
    }

    conn = sqlite3.connect('data/papers.db')
    try:
        cursor = conn.cursor()
        stats = {}
        for key, query in count_queries.items():
            cursor.execute(query)
            stats[key] = cursor.fetchone()[0]
    finally:
        # Always release the connection, even when a query fails.
        conn.close()

    stats['remaining'] = stats['total'] - stats['processed'] - stats['failed']
    return stats

def download_and_extract_text(pdf_url: str) -> Tuple[bool, Optional[str]]:
    """Download a PDF and return the concatenated text of all its pages.

    Args:
        pdf_url: URL the PDF is fetched from (30 second request timeout).

    Returns:
        ``(True, text)`` on success, where ``text`` may be an empty string if
        no page yielded text; ``(False, None)`` when the response status is
        not 200 or any exception occurs (the exception is logged, not raised).
    """
    try:
        # Download the PDF
        response = requests.get(pdf_url, timeout=30)
        if response.status_code != 200:
            return False, None

        # delete=False so the file can be reopened by path after closing it;
        # removal is handled explicitly in the finally block below.
        temp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
        temp_pdf_path = temp_pdf.name
        try:
            # Writing happens inside the try so that a failed write (e.g. a
            # full disk) still removes the temporary file.
            with temp_pdf:
                temp_pdf.write(response.content)

            # Collect per-page text and join once at the end.
            page_texts = []
            with pdfplumber.open(temp_pdf_path) as pdf:
                for page in pdf.pages:
                    page_texts.append(page.extract_text() or "")
                    gc.collect()  # Clean up after each page

            return True, "".join(page_texts)

        finally:
            # Clean up the temporary file
            os.unlink(temp_pdf_path)

    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        return False, None

def extract_and_clean_emails(text: str) -> List[str]:
    """Extract the unique, lower-cased email addresses found in ``text``.

    Args:
        text: Arbitrary text to scan. ``None`` or an empty string yields an
            empty list.

    Returns:
        Addresses in order of first appearance, lower-cased, with duplicates
        (compared case-insensitively) removed.
    """
    if not text:
        return []

    # Every match ends in at least two ASCII letters (the top-level domain),
    # so trailing punctuation such as '.', ',' or ';' is never part of a match
    # and needs no separate stripping step.
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    lowered = (match.lower() for match in re.findall(email_pattern, text))

    # dict.fromkeys de-duplicates in one pass while preserving order.
    return list(dict.fromkeys(lowered))

def update_paper_status(arxiv_id: str, emails: Optional[List[str]] = None, failed: bool = False, retry_after: Optional[str] = None):
    """Record the outcome of processing one paper in ``data/papers.db``.

    Args:
        arxiv_id: Identifier of the row to update.
        emails: Addresses found in the paper; stored as a ``', '``-joined
            string (empty string when ``None`` or empty). Ignored when
            ``failed`` is true.
        failed: When true, mark the row ``processed = -1`` and store
            ``retry_after``; otherwise mark it ``processed = 1``, store the
            emails and clear ``retry_after``.
        retry_after: Earliest time for the next attempt. The selection query
            in this module compares it against SQLite ``CURRENT_TIMESTAMP``,
            so a UTC ``'YYYY-MM-DD HH:MM:SS'`` string is expected.

    Raises:
        sqlite3.Error: If the update fails; the connection is closed either way.
    """
    conn = sqlite3.connect('data/papers.db')
    try:
        cursor = conn.cursor()

        if failed:
            # Mark as failed but allow retry later
            cursor.execute("""
            UPDATE papers 
            SET processed = -1, 
                last_attempt = CURRENT_TIMESTAMP,
                retry_after = ?
            WHERE arxiv_id = ?
            """, (retry_after, arxiv_id))
        else:
            # Successfully processed
            cursor.execute("""
            UPDATE papers 
            SET emails = ?, 
                processed = 1,
                last_attempt = CURRENT_TIMESTAMP,
                retry_after = NULL
            WHERE arxiv_id = ?
            """, (', '.join(emails) if emails else '', arxiv_id))

        conn.commit()
    finally:
        # Release the connection even if the UPDATE or commit raises.
        conn.close()

def _ensure_retry_columns(conn) -> None:
    """Rebuild ``papers`` with ``last_attempt``/``retry_after`` if they are missing."""
    cursor = conn.cursor()

    cursor.execute("PRAGMA table_info(papers)")
    columns = [col[1] for col in cursor.fetchall()]
    if 'last_attempt' in columns:
        return

    # The staging table is created only when a migration is really needed, so
    # an already-migrated database is not left with an empty 'papers_new'.
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS papers_new (
        arxiv_id TEXT PRIMARY KEY,
        title TEXT,
        authors TEXT,
        published_date TEXT,
        pdf_link TEXT,
        doi TEXT,
        abstract TEXT,
        emails TEXT,
        processed INTEGER DEFAULT 0,
        last_attempt TIMESTAMP,
        retry_after TIMESTAMP
    )
    """)
    # Backup original table
    cursor.execute("CREATE TABLE IF NOT EXISTS papers_backup AS SELECT * FROM papers")
    # Copy data to new table. NOTE: relies on the old table having exactly
    # these nine columns in this order.
    cursor.execute("""
    INSERT INTO papers_new (
        arxiv_id, title, authors, published_date, pdf_link, doi, 
        abstract, emails, processed
    ) SELECT * FROM papers
    """)
    # Replace old table with new one
    cursor.execute("DROP TABLE papers")
    cursor.execute("ALTER TABLE papers_new RENAME TO papers")
    conn.commit()


def get_unprocessed_papers(batch_size: int = 10) -> List[Dict[str, Any]]:
    """Return the next batch of papers to process, oldest publication first.

    A paper is eligible when ``processed = 0``, or when ``processed = -1`` and
    its ``retry_after`` is NULL or earlier than SQLite ``CURRENT_TIMESTAMP``.
    Before querying, the ``papers`` table is migrated to include the
    ``last_attempt`` and ``retry_after`` columns if it lacks them (the original
    rows are copied to ``papers_backup`` first).

    Args:
        batch_size: Maximum number of papers to return.

    Returns:
        A list of dicts with the keys ``arxiv_id``, ``title``, ``authors``
        (list), ``published_date``, ``pdf_link``, ``doi``, ``abstract``,
        ``emails`` (list) and ``processed``. NULL or empty ``authors`` /
        ``emails`` values become empty lists.

    Raises:
        sqlite3.Error: If the database cannot be read or migrated; the
            connection is closed either way.
    """
    conn = sqlite3.connect('data/papers.db')
    try:
        _ensure_retry_columns(conn)

        cursor = conn.cursor()
        # Columns are read by position; the first nine match the schema
        # created by the migration above.
        cursor.execute('''
        SELECT * FROM papers 
        WHERE processed = 0 
        OR (processed = -1 AND (retry_after IS NULL OR retry_after < CURRENT_TIMESTAMP))
        ORDER BY published_date ASC
        LIMIT ?
        ''', (batch_size,))

        return [
            {
                'arxiv_id': row[0],
                'title': row[1],
                # Guard against NULL authors instead of crashing on .split().
                'authors': row[2].split(', ') if row[2] else [],
                'published_date': row[3],
                'pdf_link': row[4],
                'doi': row[5],
                'abstract': row[6],
                'emails': row[7].split(', ') if row[7] else [],
                'processed': row[8],
            }
            for row in cursor.fetchall()
        ]
    finally:
        conn.close()

def _retry_delay_hours(published_date: str) -> int:
    """Choose the retry delay in hours for a failed download from the paper's age."""
    pub_date = time.strptime(published_date[:10], "%Y-%m-%d")
    days_old = (time.time() - time.mktime(pub_date)) / (24 * 3600)

    if days_old < 7:
        return 24
    if days_old < 30:
        return 12
    return 6


def _retry_timestamp(delay_seconds: float) -> str:
    """Format "now + delay" as a UTC 'YYYY-MM-DD HH:MM:SS' string."""
    # UTC (gmtime), not local time: the stored value is later compared with
    # SQLite's CURRENT_TIMESTAMP, which is expressed in UTC.
    return time.strftime("%Y-%m-%d %H:%M:%S",
                         time.gmtime(time.time() + delay_seconds))


def process_paper(paper: Dict[str, Any]) -> bool:
    """Download one paper's PDF, extract its email addresses and store the result.

    Args:
        paper: Dict with at least ``arxiv_id``, ``title``, ``pdf_link`` and
            ``published_date`` (a string starting with ``YYYY-MM-DD``).

    Returns:
        ``True`` when the paper was processed and stored as such; ``False``
        when it was marked as failed. A failed download/extraction is retried
        after 24 hours for papers under 7 days old, 12 hours for papers under
        30 days old and 6 hours otherwise; any other error is retried after
        1 hour.
    """
    arxiv_id = paper['arxiv_id']
    pdf_link = paper['pdf_link']

    logger.info(f"Processing paper: {paper['title']} (ID: {arxiv_id})")

    try:
        # Force garbage collection before processing
        gc.collect()

        # Download and extract text from the PDF
        success, text = download_and_extract_text(pdf_link)

        if not success or not text:
            retry_after = _retry_timestamp(
                _retry_delay_hours(paper['published_date']) * 3600)

            logger.warning(f"Failed to download/extract PDF for paper {arxiv_id}. Will retry after {retry_after} UTC")
            update_paper_status(arxiv_id, failed=True, retry_after=retry_after)
            return False

        # Extract email addresses from the text
        emails = extract_and_clean_emails(text)

        # Drop the (potentially large) text before touching the database
        text = None
        gc.collect()

        logger.info(f"Found {len(emails)} email addresses in paper {arxiv_id}")

        # Update the database with the extracted emails
        update_paper_status(arxiv_id, emails=emails)
        return True

    except Exception as e:
        logger.error(f"Error processing paper {arxiv_id}: {str(e)}")
        # Set retry after 1 hour for general errors
        update_paper_status(arxiv_id, failed=True,
                            retry_after=_retry_timestamp(3600))
        return False

def main():
    """Run the processing loop until it is interrupted.

    Repeatedly fetches batches of eligible papers, processes them one by one,
    saves a checkpoint after each paper and logs progress every ten papers.
    When no paper is eligible the loop waits 60 seconds and polls again.
    Final statistics are logged on exit, whatever the reason.
    """
    # The checkpoint and database live under ./data
    os.makedirs('data', exist_ok=True)

    # Resume the running paper counter from a previous session, if any
    saved = load_checkpoint()
    start_index = saved['paper_index'] if saved else 0

    stats = get_processing_stats()
    logger.info(f"Starting processing. Total papers: {stats['total']}, "
                f"Already processed: {stats['processed']}, "
                f"Failed (waiting retry): {stats['failed']}, "
                f"With emails: {stats['with_emails']}, "
                f"Remaining: {stats['remaining']}")

    batch_size = 20
    succeeded_count = 0
    failed_count = 0
    failure_streak = 0  # consecutive failures, drives the delay between papers

    try:
        while True:
            papers = get_unprocessed_papers(batch_size)
            if not papers:
                # Nothing eligible right now: wait, then poll again
                logger.info("No more papers to process at this time.")
                time.sleep(60)
                continue

            for paper in papers:
                # Back off briefly when the process exceeds 10GB of memory
                memory_usage = get_memory_usage()
                if memory_usage > 10:
                    logger.warning(f"High memory usage ({memory_usage:.2f}GB). Forcing garbage collection.")
                    gc.collect()
                    time.sleep(2)

                if process_paper(paper):
                    succeeded_count += 1
                    failure_streak = 0
                else:
                    failed_count += 1
                    failure_streak += 1

                handled = succeeded_count + failed_count
                save_checkpoint(start_index + handled)

                # Progress report every ten papers
                if handled % 10 == 0:
                    stats = get_processing_stats()
                    logger.info(f"Progress: Processed {handled} papers "
                              f"(Success: {succeeded_count}, Failed: {failed_count}). "
                              f"Total with emails: {stats['with_emails']}, "
                              f"Awaiting retry: {stats['failed']}")

                # Wait longer between papers the more failures occur in a row
                if failure_streak > 3:
                    pause_seconds = 10
                elif failure_streak > 0:
                    pause_seconds = 5
                else:
                    pause_seconds = 2
                time.sleep(pause_seconds)

            # Housekeeping between batches
            gc.collect()
            time.sleep(2)

    except KeyboardInterrupt:
        logger.info("\nProcessing interrupted by user. Progress saved.")
    except Exception as e:
        logger.error(f"Error during processing: {str(e)}")
    finally:
        stats = get_processing_stats()
        logger.info("\nProcessing complete or interrupted.")
        logger.info("Final stats:")
        logger.info(f"- Total papers in database: {stats['total']}")
        logger.info(f"- Successfully processed: {stats['processed']}")
        logger.info(f"- Failed (awaiting retry): {stats['failed']}")
        logger.info(f"- Papers with emails: {stats['with_emails']}")
        logger.info(f"- This session: {succeeded_count} successful, {failed_count} failed")

# Start the processing loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()