require('dotenv').config();
const puppeteer = require('puppeteer');
const tldjs = require('tldjs');
const KeywordExtractor = require('keyword-extractor');
const Typesense = require('typesense');

/**
 * Crawls websites listed in the Typesense `websites` collection with Puppeteer and stores
 * one document per crawled page in the `web_results` collection.
 *
 * All settings are read from environment variables in the constructor; see `this.config`.
 */
class WebCrawler {
    /**
     * Parses an integer setting, falling back when the value is missing, not a number,
     * or below `minimum`. Unlike `parseInt(x) || fallback`, a legitimate `0` is kept.
     *
     * @param {string|undefined} rawValue - Raw value, typically from `process.env`.
     * @param {number} fallback - Value returned when `rawValue` is unusable.
     * @param {number} [minimum=0] - Smallest accepted value.
     * @returns {number} The parsed integer or `fallback`.
     */
    static parseIntEnv(rawValue, fallback, minimum = 0) {
        const parsed = Number.parseInt(rawValue, 10);
        if (Number.isNaN(parsed) || parsed < minimum) {
            return fallback;
        }
        return parsed;
    }

    /**
     * Tests a robots.txt path pattern against a URL path.
     *
     * Patterns are prefix matches; `*` matches any run of characters and a trailing `$`
     * anchors the pattern to the end of the path. Uses an iterative wildcard matcher
     * instead of a generated RegExp so hostile patterns cannot cause heavy backtracking.
     *
     * @param {string} pattern - Value of an Allow/Disallow line.
     * @param {string} path - Path (optionally with query string) being checked.
     * @returns {boolean} True when the pattern matches the path.
     */
    static matchesRobotsPattern(pattern, path) {
        const anchored = pattern.endsWith('$');
        // An unanchored pattern is a prefix match, i.e. it behaves as if it ended with '*'.
        const glob = anchored ? pattern.slice(0, -1) : `${pattern}*`;

        let globIndex = 0;
        let pathIndex = 0;
        let lastStarIndex = -1;
        let pathIndexAtStar = 0;

        while (pathIndex < path.length) {
            if (globIndex < glob.length && glob[globIndex] === '*') {
                lastStarIndex = globIndex;
                pathIndexAtStar = pathIndex;
                globIndex++;
            } else if (globIndex < glob.length && glob[globIndex] === path[pathIndex]) {
                globIndex++;
                pathIndex++;
            } else if (lastStarIndex !== -1) {
                // Mismatch: let the most recent '*' absorb one more character and retry.
                globIndex = lastStarIndex + 1;
                pathIndexAtStar++;
                pathIndex = pathIndexAtStar;
            } else {
                return false;
            }
        }

        while (globIndex < glob.length && glob[globIndex] === '*') {
            globIndex++;
        }
        return globIndex === glob.length;
    }

    /**
     * Builds the crawler configuration and the Typesense client from environment variables.
     * The browser is not launched here; see `initBrowser`.
     */
    constructor() {
        const env = process.env;
        const { parseIntEnv } = WebCrawler;

        this.config = {
            botName: env.BOT_NAME || 'WebCrawlerBot/1.0',
            // 0 is meaningful here: crawlWebsite skips inner links when this is 0.
            maxInnerLinks: parseIntEnv(env.MAX_INNER_LINKS, 10),
            maxRetries: parseIntEnv(env.MAX_RETRIES, 3, 1),
            respectRobots: env.RESPECT_ROBOTS === 'true',
            requestDelay: parseIntEnv(env.REQUEST_DELAY, 1000),
            batchSize: parseIntEnv(env.BATCH_SIZE, 10, 1),
            headless: env.HEADLESS === 'true',
            viewportWidth: parseIntEnv(env.VIEWPORT_WIDTH, 1920, 1),
            viewportHeight: parseIntEnv(env.VIEWPORT_HEIGHT, 1080, 1),
            userAgent: env.USER_AGENT,
            pageTimeout: parseIntEnv(env.PAGE_TIMEOUT, 30000),
            navigationTimeout: parseIntEnv(env.NAVIGATION_TIMEOUT, 30000)
        };

        this.browser = null;
        this.results = [];

        // Initialize Typesense client
        this.typesenseClient = new Typesense.Client({
            'nodes': [{
                'host': env.TYPESENSE_HOST || 'localhost',
                'port': env.TYPESENSE_PORT || '8108',
                'protocol': env.TYPESENSE_PROTOCOL || 'http'
            }],
            'apiKey': env.TYPESENSE_API_KEY,
            'connectionTimeoutSeconds': parseIntEnv(env.TYPESENSE_CONNECTION_TIMEOUT, 5, 1)
        });

        // Sexual words for safe search detection
        this.sexualWords = [
            'sex', 'porn', 'xxx', 'adult', 'nude', 'naked', 'sexy', 'erotic',
            'fetish', 'hardcore', 'explicit', 'nsfw', 'mature', 'escort',
            'webcam', 'cam', 'strip', 'milf', 'amateur', 'boobs', 'ass',
            'dick', 'pussy', 'fuck', 'orgasm', 'masturbation', 'blowjob'
        ];
    }

    /**
     * Creates the `web_results` collection if it does not exist yet.
     * An "already exists" response is treated as success; other creation errors are
     * logged as warnings and do not abort start-up.
     *
     * @returns {Promise<void>}
     * @throws Re-throws unexpected errors raised outside the create call.
     */
    async initializeCollections() {
        try {
            console.log('Initializing Typesense collections...');

            // Schema of the documents written by saveToWebResults
            const webResultsSchema = {
                'name': 'web_results',
                'fields': [
                    {'name': 'id', 'type': 'string'},
                    {'name': 'title', 'type': 'string'}, // Required field
                    {'name': 'description', 'type': 'string'}, // Required field
                    {'name': 'url', 'type': 'string'},
                    {'name': 'favicon', 'type': 'string', 'optional': true},
                    {'name': 'date', 'type': 'string'},
                    {'name': 'keywords', 'type': 'string[]'},
                    {'name': 'safesearch', 'type': 'bool'},
                    {'name': 'maindomain', 'type': 'bool'},
                    {'name': 'error', 'type': 'string', 'optional': true}
                ]
            };

            // Try to create web_results collection (websites collection should already exist)
            try {
                await this.typesenseClient.collections().create(webResultsSchema);
                console.log('✓ Created web_results collection');
            } catch (error) {
                // Recognise the conflict by HTTP status as well as by message text.
                const alreadyExists = error?.httpStatus === 409 ||
                    String(error?.message).includes('already exists');
                if (alreadyExists) {
                    console.log('✓ Web_results collection already exists');
                } else {
                    console.warn('Warning creating web_results collection:', error?.message);
                }
            }

            console.log('Typesense collections initialized successfully');
        } catch (error) {
            console.error('Failed to initialize Typesense collections:', error.message);
            throw error;
        }
    }

    /**
     * Fetches the next batch of websites that still need a web crawl.
     *
     * Pages through every `supportWebCrawl` website until `config.batchSize` candidates
     * are collected, so uncrawled sites are found even when the first result page only
     * contains completed ones.
     *
     * @param {Set<string>} [excludeIds] - Website ids to skip (e.g. already attempted in this run).
     * @returns {Promise<object[]>} Website documents to crawl; empty on error or when none remain.
     */
    async getWebsitesToCrawl(excludeIds = new Set()) {
        try {
            console.log(`Fetching websites to crawl...`);

            const perPage = 250;
            const websitesToCrawl = [];
            let scannedCount = 0;

            for (let page = 1; websitesToCrawl.length < this.config.batchSize; page++) {
                const searchParameters = {
                    'q': '*',
                    'query_by': 'title,url',
                    'filter_by': 'supportWebCrawl:=true',
                    'per_page': perPage,
                    'page': page
                };

                console.log('Search parameters:', searchParameters);

                const searchResults = await this.typesenseClient.collections('websites').documents().search(searchParameters);
                const hits = searchResults.hits || [];
                scannedCount += hits.length;

                for (const { document: website } of hits) {
                    // A missing webcrawl_status means "never crawled" and is included.
                    if (website.webcrawl_status === 'complete' || excludeIds.has(website.id)) {
                        continue;
                    }
                    websitesToCrawl.push(website);
                    if (websitesToCrawl.length >= this.config.batchSize) {
                        break;
                    }
                }

                // A short page means there are no further results.
                if (hits.length < perPage) {
                    break;
                }
            }

            console.log(`Scanned ${scannedCount} websites with supportWebCrawl enabled`);
            console.log(`After filtering: ${websitesToCrawl.length} websites need crawling`);

            if (websitesToCrawl.length > 0) {
                console.log('\nWebsites to crawl:');
                websitesToCrawl.forEach((website, index) => {
                    console.log(`${index + 1}. ${website.title} (${website.url}) - Status: ${website.webcrawl_status || 'never crawled'}`);
                });
            }

            return websitesToCrawl;
        } catch (error) {
            console.error('Error fetching websites to crawl:', error.message);
            return [];
        }
    }

    /**
     * Checks whether `web_results` already has a document matching the URL.
     *
     * @param {string} url - URL to look up.
     * @returns {Promise<boolean>} True when a hit is found; false when none is found or the lookup fails.
     */
    async checkUrlExists(url) {
        try {
            const searchParameters = {
                'q': `"${url}"`,
                'query_by': 'url',
                'per_page': 1
            };

            const searchResults = await this.typesenseClient.collections('web_results').documents().search(searchParameters);
            return searchResults.hits.length > 0;
        } catch (error) {
            console.warn(`Error checking if URL exists: ${error.message}`);
            return false;
        }
    }

    /**
     * Converts a crawl result into a `web_results` document.
     *
     * @param {object} result - Result produced by `crawlPage`.
     * @param {string} sourceWebsiteId - Id of the website the page belongs to; used as id prefix.
     * @returns {object} Document with all required fields; `favicon`/`error` only when non-empty.
     */
    buildWebResultDocument(result, sourceWebsiteId) {
        // An unparsable crawledAt must not abort the whole save, so fall back to "now".
        const crawledDate = new Date(result.crawledAt);
        const date = Number.isNaN(crawledDate.getTime())
            ? new Date().toISOString()
            : crawledDate.toISOString();

        const document = {
            id: `${sourceWebsiteId}_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`,
            url: result.url,
            title: result.title?.trim() || 'No Title',
            description: result.description?.trim() || 'no description',
            date,
            keywords: Array.isArray(result.keywords) ? result.keywords : [],
            safesearch: result.safeSearch === true,
            maindomain: result.mainDomain === true
        };

        // Add optional fields only if they have actual values
        const favicon = typeof result.favicon === 'string' ? result.favicon.trim() : '';
        if (favicon !== '') {
            document.favicon = favicon;
        }

        const errorText = typeof result.error === 'string' ? result.error.trim() : '';
        if (errorText !== '') {
            document.error = errorText;
        }

        return document;
    }

    /**
     * Saves one document to `web_results`, retrying with progressively simpler documents.
     *
     * Attempt order: the full document, a minimal document with an ISO date, then a minimal
     * document with a Unix-seconds date string. The fallbacks keep the computed
     * `safesearch`/`maindomain` flags instead of resetting them.
     *
     * @param {object} doc - Document built by `buildWebResultDocument`.
     * @returns {Promise<boolean>} True when any attempt succeeded.
     */
    async createWithFallbacks(doc) {
        const placeholderFields = {
            url: doc.url,
            title: 'No Title',
            description: 'no description',
            keywords: [],
            safesearch: doc.safesearch,
            maindomain: doc.maindomain
        };

        const attempts = [
            {
                document: doc,
                successMessage: `✅ Successfully saved: ${doc.url}`,
                failureMessage: `❌ Failed to save ${doc.url}:`
            },
            {
                document: { id: `minimal_${doc.id}`, ...placeholderFields, date: new Date().toISOString() },
                successMessage: `✅ Saved minimal version for: ${doc.url}`,
                failureMessage: `❌ Even minimal save failed for ${doc.url}:`
            },
            {
                document: { id: `timestamp_${doc.id}`, ...placeholderFields, date: Math.floor(Date.now() / 1000).toString() },
                successMessage: `✅ Saved with timestamp format for: ${doc.url}`,
                failureMessage: `❌ All save attempts failed for ${doc.url}:`
            }
        ];

        for (const [attemptIndex, attempt] of attempts.entries()) {
            try {
                await this.typesenseClient.collections('web_results').documents().create(attempt.document);
                console.log(attempt.successMessage);
                return true;
            } catch (saveError) {
                console.error(attempt.failureMessage, saveError.message);
                if (attemptIndex === 0) {
                    // Log the problematic document structure
                    console.error('📄 Problematic document:', JSON.stringify(doc, null, 2));
                }
            }
        }

        return false;
    }

    /**
     * Stores crawl results in `web_results`, skipping URLs that repeat within the batch
     * or that already exist in the collection.
     *
     * @param {object[]} crawlResults - Results produced by `crawlWebsite`.
     * @param {string} sourceWebsiteId - Id of the crawled website.
     * @returns {Promise<void>}
     * @throws Re-throws unexpected errors after logging them.
     */
    async saveToWebResults(crawlResults, sourceWebsiteId) {
        try {
            console.log(`Saving ${crawlResults.length} results to web_results collection...`);

            const documentsToInsert = [];
            // Existence checks run before any insert, so repeats inside the batch are tracked here.
            const seenUrls = new Set();

            for (const result of crawlResults) {
                if (seenUrls.has(result.url)) {
                    console.log(`⚠️ Skipping duplicate URL: ${result.url}`);
                    continue;
                }
                seenUrls.add(result.url);

                // Check if URL already exists to avoid duplicates
                if (await this.checkUrlExists(result.url)) {
                    console.log(`⚠️ Skipping duplicate URL: ${result.url}`);
                    continue;
                }

                const document = this.buildWebResultDocument(result, sourceWebsiteId);
                documentsToInsert.push(document);

                // Debug log the document structure
                console.log(`📄 Prepared document for ${result.url}:`, {
                    id: document.id,
                    url: document.url,
                    title: document.title,
                    description: document.description,
                    date: document.date,
                    keywordsCount: document.keywords.length,
                    safesearch: document.safesearch,
                    maindomain: document.maindomain,
                    hasError: !!document.error
                });
            }

            if (documentsToInsert.length === 0) {
                console.log('ℹ️ No new documents to save (all were duplicates)');
                return;
            }

            console.log(`Importing ${documentsToInsert.length} documents...`);

            // Import documents one by one to better handle errors
            let successCount = 0;
            let errorCount = 0;

            for (const doc of documentsToInsert) {
                // A document counts as failed only when every fallback failed too.
                if (await this.createWithFallbacks(doc)) {
                    successCount++;
                } else {
                    errorCount++;
                }
            }

            console.log(`✓ Import completed: ${successCount} successful, ${errorCount} failed`);
        } catch (error) {
            console.error('Error saving to web_results:', error.message);
            throw error;
        }
    }

    /**
     * Records the web-crawl status of a website and, when the crawl completed, derives the
     * overall `status`: `approved` if the image crawl is complete as well, otherwise `partial`.
     * Failures are logged and not thrown; the status update is retried once.
     *
     * @param {string} websiteId - Id of the document in the `websites` collection.
     * @param {string} [crawlStatus='complete'] - Value written to `webcrawl_status`.
     * @returns {Promise<void>}
     */
    async updateWebsiteStatus(websiteId, crawlStatus = 'complete') {
        const websiteDocument = () => this.typesenseClient.collections('websites').documents(websiteId);

        try {
            console.log(`🔄 Updating website ${websiteId} crawl status to: ${crawlStatus}`);

            await websiteDocument().update({ webcrawl_status: crawlStatus });
            console.log(`✓ Updated website ${websiteId} webcrawl_status to: ${crawlStatus}`);

            // The overall status is only derived after a completed crawl.
            if (crawlStatus !== 'complete') {
                return;
            }

            try {
                // Retrieve the updated document to check both statuses
                const document = await websiteDocument().retrieve();

                console.log(`🔍 Checking overall status for ${websiteId}:`, {
                    webcrawl_status: document.webcrawl_status,
                    imagecrawl_status: document.imagecrawl_status || 'not set'
                });

                const bothComplete = document.webcrawl_status === 'complete' &&
                    document.imagecrawl_status === 'complete';

                if (bothComplete) {
                    await websiteDocument().update({ status: 'approved' });
                    console.log(`🎉 Updated website ${websiteId} overall status to: APPROVED (both crawls complete)`);
                } else {
                    console.log(`⏳ Website ${websiteId} not fully approved yet:`, {
                        webcrawl_status: document.webcrawl_status,
                        imagecrawl_status: document.imagecrawl_status || 'pending',
                        overall_status: 'pending - waiting for other crawl type'
                    });

                    // Set status to partial if only one type is complete
                    await websiteDocument().update({ status: 'partial' });
                    console.log(`📝 Updated website ${websiteId} overall status to: PARTIAL`);
                }
            } catch (retrieveError) {
                console.error(`❌ Could not retrieve document ${websiteId} to check overall status:`, retrieveError.message);
            }
        } catch (error) {
            console.error(`❌ Error updating website ${websiteId} status:`, error.message);

            // Retry the plain status update once
            try {
                console.log(`🔄 Attempting simple status update for ${websiteId}...`);
                await websiteDocument().update({ webcrawl_status: crawlStatus });
                console.log(`✓ Simple update successful for ${websiteId}`);
            } catch (simpleError) {
                console.error(`❌ Simple update also failed for ${websiteId}:`, simpleError.message);
            }
        }
    }

    /**
     * Launches the Puppeteer browser and stores it in `this.browser`.
     *
     * @returns {Promise<void>}
     * @throws Re-throws the launch error after logging it.
     */
    async initBrowser() {
        try {
            console.log('Initializing Chrome browser...');

            // NOTE(review): --no-sandbox and --disable-web-security weaken browser isolation
            // while visiting untrusted sites; confirm the deployment really needs them.
            const browserOptions = {
                headless: this.config.headless,
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-gpu',
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor'
                ]
            };

            this.browser = await puppeteer.launch(browserOptions);
            console.log('✓ Chrome browser initialized successfully');
        } catch (error) {
            console.error('❌ Failed to initialize Chrome browser:', error.message);
            throw error;
        }
    }

    /**
     * Checks whether robots.txt permits crawling the URL.
     * Always returns true when `config.respectRobots` is off, when robots.txt cannot be
     * fetched, or when the server answers with a non-OK status.
     *
     * @param {string} url - Absolute URL about to be crawled.
     * @returns {Promise<boolean>} True when crawling is permitted.
     */
    async checkRobotsTxt(url) {
        if (!this.config.respectRobots) {
            return true;
        }

        try {
            const target = new URL(url);

            const response = await fetch(`${target.origin}/robots.txt`, {
                method: 'GET',
                headers: {
                    'User-Agent': this.config.userAgent || this.config.botName
                },
                // fetch() has no `timeout` option; an abort signal enforces the 5 s limit.
                signal: AbortSignal.timeout(5000)
            });

            if (!response.ok) {
                return true;
            }

            const robotsContent = await response.text();
            // Include the query string so rules that mention '?' can match.
            return this.parseRobotsTxt(robotsContent, `${target.pathname}${target.search}`);

        } catch (error) {
            console.warn(`Could not check robots.txt for ${url}:`, error.message);
            return true;
        }
    }

    /**
     * Evaluates robots.txt content for `config.botName`.
     *
     * Consecutive `User-agent` lines form one group. Groups naming this bot (full name or
     * the product token before `/`) take precedence; the `*` group applies only when no
     * group names the bot. Among matching rules the longest pattern wins and `Allow` wins
     * ties. Inline `#` comments are stripped and empty rule values are ignored.
     *
     * @param {string} robotsContent - Raw robots.txt text.
     * @param {string} targetPath - Path (optionally with query string) to check.
     * @returns {boolean} True when the path may be crawled.
     */
    parseRobotsTxt(robotsContent, targetPath) {
        const botNameLower = this.config.botName.toLowerCase();
        const productToken = botNameLower.split('/')[0].trim();

        const groups = [];
        let currentGroup = null;
        let previousLineWasUserAgent = false;

        for (const rawLine of robotsContent.split(/\r?\n/)) {
            const commentStart = rawLine.indexOf('#');
            const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();

            const separatorIndex = line.indexOf(':');
            if (separatorIndex === -1) {
                continue;
            }

            const field = line.slice(0, separatorIndex).trim().toLowerCase();
            const value = line.slice(separatorIndex + 1).trim();

            if (field === 'user-agent') {
                // A user-agent line directly after another one extends the same group.
                if (!previousLineWasUserAgent) {
                    currentGroup = { agents: [], rules: [] };
                    groups.push(currentGroup);
                }
                currentGroup.agents.push(value.toLowerCase());
                previousLineWasUserAgent = true;
            } else if (field === 'allow' || field === 'disallow') {
                previousLineWasUserAgent = false;
                // An empty value imposes no restriction; rules before any group are ignored.
                if (currentGroup && value !== '') {
                    currentGroup.rules.push({ allow: field === 'allow', pattern: value });
                }
            }
        }

        const namesThisBot = (agent) =>
            agent !== '' && (agent === botNameLower || agent === productToken);

        let applicableGroups = groups.filter((group) => group.agents.some(namesThisBot));
        if (applicableGroups.length === 0) {
            applicableGroups = groups.filter((group) => group.agents.includes('*'));
        }

        let isAllowed = true;
        let bestPatternLength = -1;

        for (const group of applicableGroups) {
            for (const rule of group.rules) {
                if (!WebCrawler.matchesRobotsPattern(rule.pattern, targetPath)) {
                    continue;
                }

                const patternLength = rule.pattern.length;
                const isMoreSpecific = patternLength > bestPatternLength;
                const isAllowTie = patternLength === bestPatternLength && rule.allow;
                if (isMoreSpecific || isAllowTie) {
                    bestPatternLength = patternLength;
                    isAllowed = rule.allow;
                }
            }
        }

        return isAllowed;
    }

    /**
     * Extracts up to `limit` English keywords from free text.
     *
     * @param {string} text - Text to analyse.
     * @param {number} [limit=10] - Maximum number of keywords returned.
     * @returns {string[]} Keywords; empty for blank input or when extraction fails.
     */
    extractKeywords(text, limit = 10) {
        if (!text || text.trim() === '') return [];

        try {
            const keywords = KeywordExtractor.extract(text, {
                language: 'english',
                remove_digits: true,
                return_changed_case: true,
                remove_duplicates: true
            });

            return keywords.slice(0, limit);
        } catch (error) {
            console.warn('Keyword extraction failed:', error.message);
            return [];
        }
    }

    /**
     * Reports whether text is free of the words in `this.sexualWords`.
     *
     * NOTE(review): matching is by substring, so short entries such as 'ass' or 'cam' also
     * flag words like "class" or "camera"; confirm whether whole-word matching is wanted.
     *
     * @param {string} text - Text to scan (case-insensitive).
     * @returns {boolean} True when no listed word occurs (or the text is empty).
     */
    detectSafeSearch(text) {
        if (!text) return true;

        const lowerText = text.toLowerCase();
        return !this.sexualWords.some(word => lowerText.includes(word));
    }

    /**
     * Reports whether the URL is the root page of a domain without any subdomain.
     *
     * @param {string} url - Absolute URL.
     * @returns {boolean} True for a root path on a host without subdomain; false on parse errors.
     */
    isMainDomain(url) {
        try {
            const parsedUrl = tldjs.parse(url);
            const urlObj = new URL(url);

            const isSubdomain = parsedUrl.subdomain && parsedUrl.subdomain !== '';
            const isRootPath = urlObj.pathname === '/' || urlObj.pathname === '';

            return !isSubdomain && isRootPath;
        } catch (error) {
            console.warn(`Could not parse URL ${url}:`, error.message);
            return false;
        }
    }

    /**
     * Resolves the favicon URL for a page.
     * The page's own favicon reference is resolved against the page URL (so relative,
     * root-relative, protocol-relative and `data:` references all work); without one the
     * conventional `/favicon.ico` of the page's origin is returned.
     *
     * @param {string} url - URL of the crawled page.
     * @param {{favicon: (string|null)}} pageData - Data extracted from the page.
     * @returns {string} Absolute favicon URL, or '' when the URL cannot be parsed.
     */
    extractFavicon(url, pageData) {
        try {
            const pageUrl = new URL(url);

            if (pageData.favicon) {
                return new URL(pageData.favicon, pageUrl).href;
            }

            return `${pageUrl.origin}/favicon.ico`;
        } catch (error) {
            console.warn(`Could not extract favicon for ${url}:`, error.message);
            return '';
        }
    }

    /**
     * Builds the result object used for pages that could not be crawled.
     *
     * @param {string} url - URL that was attempted.
     * @param {string} message - Reason the crawl produced no data.
     * @returns {object} Result with the same shape as a successful crawl plus `error`.
     */
    createErrorResult(url, message) {
        return {
            url,
            error: message,
            title: null,
            description: null,
            keywords: [],
            safeSearch: true,
            mainDomain: this.isMainDomain(url),
            favicon: '',
            crawledAt: new Date().toISOString()
        };
    }

    /**
     * Closes a Puppeteer page, logging (not throwing) when closing fails so that a
     * cleanup error never replaces the crawl result.
     *
     * @param {object|null} page - Page to close; null is ignored.
     * @returns {Promise<void>}
     */
    async closePage(page) {
        if (!page) {
            return;
        }

        try {
            await page.close();
        } catch (error) {
            console.warn('Could not close page:', error.message);
        }
    }

    /**
     * Loads the URL in the given page and extracts title, description, keywords,
     * favicon and the safe-search flag.
     *
     * @param {object} page - Open Puppeteer page.
     * @param {string} url - URL to load.
     * @returns {Promise<object>} Crawl result without an `error` field.
     * @throws Propagates navigation and evaluation errors to the caller.
     */
    async extractPageResult(page, url) {
        if (this.config.userAgent) {
            await page.setUserAgent(this.config.userAgent);
        }

        await page.setViewport({
            width: this.config.viewportWidth,
            height: this.config.viewportHeight
        });

        page.setDefaultNavigationTimeout(this.config.navigationTimeout);
        page.setDefaultTimeout(this.config.pageTimeout);

        await page.goto(url, {
            waitUntil: 'networkidle2',
            timeout: this.config.navigationTimeout
        });

        // Runs inside the browser page; it cannot reference variables of this method.
        const pageData = await page.evaluate(() => {
            const getMetaContent = (name) => {
                const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"], meta[property="og:${name}"]`);
                return meta ? meta.getAttribute('content') : null;
            };

            const getFavicon = () => {
                let favicon = document.querySelector('link[rel="icon"]')?.href ||
                             document.querySelector('link[rel="shortcut icon"]')?.href ||
                             document.querySelector('link[rel="apple-touch-icon"]')?.href ||
                             document.querySelector('link[rel="favicon"]')?.href;

                return favicon || null;
            };

            return {
                title: document.title || null,
                description: getMetaContent('description') || null,
                keywords: getMetaContent('keywords') || null,
                favicon: getFavicon(),
                bodyText: document.body ? document.body.innerText.slice(0, 1000) : ''
            };
        });

        // Keyword source priority: meta keywords, then description, then body text.
        let keywords = [];
        if (pageData.keywords) {
            keywords = pageData.keywords.split(',').map(k => k.trim()).filter(k => k);
        } else if (pageData.description) {
            keywords = this.extractKeywords(pageData.description);
        } else {
            keywords = this.extractKeywords(pageData.bodyText);
        }

        let description = pageData.description;
        if (!description && pageData.bodyText) {
            description = pageData.bodyText.slice(0, 160) + '...';
        }
        if (!description) {
            description = 'no description';
        }

        const safeSearchText = `${pageData.title || ''} ${description} ${keywords.join(' ')}`;

        return {
            url,
            title: pageData.title,
            description,
            keywords,
            safeSearch: this.detectSafeSearch(safeSearchText),
            mainDomain: this.isMainDomain(url),
            favicon: this.extractFavicon(url, pageData),
            crawledAt: new Date().toISOString()
        };
    }

    /**
     * Crawls a single page, retrying failed attempts up to `config.maxRetries` in total.
     * Each attempt uses its own page, which is closed before the next attempt starts.
     *
     * @param {string} url - Absolute URL to crawl.
     * @param {number} [retryCount=0] - Number of attempts already made.
     * @returns {Promise<object>} Crawl result; carries an `error` field when robots.txt
     *   disallows the URL or every attempt failed. Never rejects for crawl failures.
     */
    async crawlPage(url, retryCount = 0) {
        const robotsAllowed = await this.checkRobotsTxt(url);
        if (!robotsAllowed) {
            console.log(`❌ Robots.txt disallows crawling: ${url}`);
            return this.createErrorResult(url, 'Disallowed by robots.txt');
        }

        for (let attempt = retryCount; ; attempt++) {
            console.log(`🔍 Crawling: ${url} (Attempt ${attempt + 1})`);

            let page = null;
            try {
                page = await this.browser.newPage();
                const result = await this.extractPageResult(page, url);

                console.log(`✅ Successfully crawled: ${url}`);
                return result;
            } catch (error) {
                console.error(`❌ Error crawling ${url}:`, error.message);

                if (attempt >= this.config.maxRetries - 1) {
                    return this.createErrorResult(url, error.message);
                }
            } finally {
                // Release the page now rather than after all retries have finished.
                await this.closePage(page);
            }

            console.log(`🔄 Retrying ${url} in ${this.config.requestDelay}ms...`);
            await this.delay(this.config.requestDelay);
        }
    }

    /**
     * Collects up to `config.maxInnerLinks` distinct absolute links found on a page.
     *
     * NOTE(review): every `http(s)` link is kept, including links to other hosts;
     * confirm whether only same-site links are intended.
     *
     * @param {string} url - Page whose anchors are inspected.
     * @returns {Promise<string[]>} Link URLs in document order; empty when the page cannot be loaded.
     */
    async findInnerLinks(url) {
        let page = null;

        try {
            page = await this.browser.newPage();
            await page.goto(url, {
                waitUntil: 'networkidle2',
                timeout: this.config.navigationTimeout
            });

            const links = await page.evaluate(() => {
                const hrefs = Array.from(document.querySelectorAll('a[href]'))
                    .map(anchor => anchor.href)
                    .filter(href => href.startsWith('http'));
                // A Set keeps first-occurrence order while removing duplicates.
                return [...new Set(hrefs)];
            });

            return links.slice(0, this.config.maxInnerLinks);
        } catch (error) {
            console.warn(`Could not extract inner links from ${url}:`, error.message);
            return [];
        } finally {
            await this.closePage(page);
        }
    }

    /**
     * Resolves after the given number of milliseconds.
     *
     * @param {number} ms - Delay in milliseconds.
     * @returns {Promise<void>}
     */
    delay(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /**
     * Crawls a website's own URL and, when that succeeded, the links found on it.
     *
     * @param {{title: string, url: string}} website - Website document to crawl.
     * @returns {Promise<object[]>} One result per crawled page; a single error result on unexpected failure.
     */
    async crawlWebsite(website) {
        const results = [];

        try {
            console.log(`\n🌐 Starting crawl for: ${website.title} (${website.url})`);

            const result = await this.crawlPage(website.url);
            results.push(result);

            if (!result.error && this.config.maxInnerLinks > 0) {
                console.log(`🔗 Finding inner links for: ${website.url}`);
                const innerLinks = await this.findInnerLinks(website.url);

                console.log(`Found ${innerLinks.length} inner links`);

                for (const innerLink of innerLinks) {
                    const innerResult = await this.crawlPage(innerLink);
                    results.push(innerResult);
                    await this.delay(this.config.requestDelay);
                }
            }

            return results;
        } catch (error) {
            console.error(`❌ Failed to crawl website ${website.url}:`, error.message);
            return [this.createErrorResult(website.url, error.message)];
        }
    }

    /**
     * Runs the crawler until no uncrawled website remains: fetches batches, crawls each
     * website, stores the results and updates the website status.
     *
     * Every website is attempted at most once per run, so a site that keeps failing (or
     * whose status update fails) cannot keep the loop running forever.
     *
     * @returns {Promise<void>}
     * @throws Re-throws initialization or unexpected loop errors; the browser is closed either way.
     */
    async runCrawlLoop() {
        try {
            await this.initializeCollections();
            await this.initBrowser();

            let processedBatches = 0;
            let totalProcessed = 0;
            const attemptedIds = new Set();

            while (true) {
                console.log('\n' + '='.repeat(60));
                console.log(`🚀 Starting batch ${processedBatches + 1}...`);
                console.log('='.repeat(60));

                const fetchedWebsites = await this.getWebsitesToCrawl(attemptedIds);
                // Filter again here so the guarantee holds even if getWebsitesToCrawl is overridden.
                const websitesToCrawl = fetchedWebsites.filter((website) => !attemptedIds.has(website.id));

                if (websitesToCrawl.length === 0) {
                    console.log('✅ No more websites to crawl. Exiting...');
                    break;
                }

                for (const website of websitesToCrawl) {
                    attemptedIds.add(website.id);

                    try {
                        console.log(`\n📝 Processing website: ${website.title} (${website.url})`);

                        const crawlResults = await this.crawlWebsite(website);

                        await this.saveToWebResults(crawlResults, website.id);

                        await this.updateWebsiteStatus(website.id, 'complete');

                        totalProcessed++;
                        console.log(`✅ Completed processing website: ${website.title}`);

                        await this.delay(this.config.requestDelay);

                    } catch (error) {
                        console.error(`❌ Failed to process website ${website.url}:`, error.message);

                        try {
                            await this.updateWebsiteStatus(website.id, 'failed');
                        } catch (updateError) {
                            console.error(`❌ Failed to update status for ${website.id}:`, updateError.message);
                        }
                    }
                }

                processedBatches++;
                console.log(`\n✅ Completed batch ${processedBatches}. Total websites processed: ${totalProcessed}`);

                await this.delay(2000);
            }

            console.log('\n' + '='.repeat(60));
            console.log('🎉 CRAWL SUMMARY');
            console.log('='.repeat(60));
            console.log(`Total batches processed: ${processedBatches}`);
            console.log(`Total websites processed: ${totalProcessed}`);
            console.log('All websites have been crawled!');

        } catch (error) {
            console.error('❌ Crawl loop failed:', error.message);
            throw error;
        } finally {
            if (this.browser) {
                await this.browser.close();
                this.browser = null;
                console.log('🔒 Browser closed.');
            }
        }
    }
}

// Main execution
/**
 * Command-line entry point: constructs a crawler and runs its crawl loop.
 * When the loop rejects, the failure message is logged and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const webCrawler = new WebCrawler();

    try {
        await webCrawler.runCrawlLoop();
    } catch (failure) {
        // Exit non-zero so the caller of the process can detect the failed run.
        console.error('❌ Crawler execution failed:', failure.message);
        process.exit(1);
    }
}

// Run the crawler only when this file is executed directly, not when it is required
// by another module.
if (require.main === module) {
    // main() already handles crawl failures; this catch logs anything that still rejects.
    main().catch(console.error);
}

// Export the class so other modules can create and drive their own crawler instance.
module.exports = WebCrawler;