Showing posts with label Node.js. Show all posts
Showing posts with label Node.js. Show all posts

Sunday, 3 August 2025

Crawl Page And Capture Screenshot PDF


This function does the following:

  • Full-Page PDF Generation
  • Recursive Internal Link Crawling
  • Content and Metadata Extraction
  • Intelligent Overlay & Popup Handling
  • Auto Scroll to Load Dynamic Content
  • Persistent File & DB Storage
  • User and Context-Aware
  • Timeout and Error Handling

/**
 * Crawls `url` in a new browser page, stores a PDF rendering of it together
 * with extracted metadata, and then recurses into same-origin links.
 *
 * Per page: navigate, try to dismiss cookie/consent popups, scroll to trigger
 * lazy loading, strip overlay elements, and (only when no entry exists yet for
 * this URL and user) write the PDF to disk and save a database entry.
 * Errors for a single page are logged as warnings and do not abort the crawl.
 *
 * @param browser Browser used to open one new page per crawled URL.
 * @param url     Absolute URL to crawl.
 * @param user    Owner of the crawl; `user.id` scopes the duplicate lookup and names the output folder.
 * @param depth   Remaining link depth: 0 crawls only `url`; negative values return immediately.
 * @param visited URLs already handled in this run; mutated in place.
 * @param parent  Passed through unchanged to every saved entry and every recursive call.
 */
private async crawlPage(
        browser: Browser,
        url: string,
        user: User,
        depth: number,
        visited: Set<string>,
        parent: any,
    ) {
        if (visited.has(url) || depth < 0) return;
        visited.add(url);

        this.logger.log(`🔍 Crawling URL: ${url} (Depth: ${depth})`);

        const existingEntryURL = await this.KnowledgeBaseDetailRepo.findOne({
            where: { url, user: { id: user.id } },
            relations: ['user'],
        });

        const page = await browser.newPage();

        try {
            await page.setUserAgent(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
            );
            await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-US,en;q=0.9' });
            await page.setDefaultNavigationTimeout(60000);
            await page.goto(url, { waitUntil: 'networkidle2' });

            // Reject cookie popups (best effort): first by id/class hints...
            const rejectButtonSelectors = [
                '[id*="reject"]', '[id*="deny"]', '[id*="decline"]',
                '[class*="reject"]', '[class*="deny"]', '[class*="decline"]',
            ];
            let popupRejected = false;
            for (const selector of rejectButtonSelectors) {
                try {
                    const button = await page.waitForSelector(selector, { timeout: 2000 });
                    if (button) {
                        await button.click();
                        popupRejected = true;
                        break;
                    }
                } catch {
                    // Selector did not show up in time (or the click failed): try the next one.
                }
            }

            // ...then by button text. `button:text("...")` is not a selector that
            // `querySelector` understands, so the labels are matched in the page instead,
            // in priority order and case-insensitively.
            if (!popupRejected) {
                const rejectButtonLabels = ['Reject', 'Deny', 'Decline', 'Manage preferences'];
                try {
                    popupRejected = await page.evaluate((labels) => {
                        const buttons = Array.from(document.querySelectorAll('button'));
                        return labels.some((label) => {
                            const wanted = label.toLowerCase();
                            const match = buttons.find((candidate) =>
                                (candidate.textContent || '').toLowerCase().includes(wanted),
                            );
                            if (match) match.click();
                            return Boolean(match);
                        });
                    }, rejectButtonLabels);
                } catch {
                    // Popup handling must never fail the crawl of this page.
                }
            }
            if (popupRejected) {
                this.logger.log(`✅ Rejected popup on ${url}`);
            }

            // Scroll entire page to trigger lazy loading. The step count is capped so
            // that pages which keep growing (infinite scroll) cannot loop forever.
            await page.evaluate(async () => {
                const delay = (ms: number) => new Promise(res => setTimeout(res, ms));
                const distance = 500;
                const maxSteps = 300;
                let totalHeight = 0;
                for (let step = 0; step < maxSteps && totalHeight < document.body.scrollHeight; step++) {
                    window.scrollBy(0, distance);
                    await delay(500);
                    totalHeight += distance;
                }
            });

            // Remove overlays and normalise headers before rendering
            await page.evaluate(() => {
                // Remove cookie overlays and full-screen modals
                const overlaySelectors = [
                    '[class*="overlay"]', '[id*="overlay"]',
                    '.popup', '.modal', '[class*="cookie"]', '[id*="cookie"]',
                    '[class*="consent"]', '[id*="consent"]'
                ];
                overlaySelectors.forEach(selector => {
                    document.querySelectorAll(selector).forEach(el => el.remove());
                });

                // Put every <header> back into normal flow (position: relative) and
                // force it visible, so it is rendered once at the top of the document.
                const headers = document.querySelectorAll('header') as NodeListOf<HTMLElement>;
                headers.forEach(header => {
                    header.style.position = 'relative';
                    header.style.top = '0';
                    header.style.left = '0';
                    header.style.width = '100%';
                    header.style.zIndex = '9999';
                    header.style.display = 'block';
                    header.style.visibility = 'visible';
                    header.style.opacity = '1';
                });
            });

            await page.evaluate(() => window.scrollTo(0, 0));
            await new Promise(res => setTimeout(res, 2000));

            // Render and persist only when this URL has no entry yet for this user;
            // otherwise the PDF and metadata would be computed and thrown away.
            if (!existingEntryURL) {
                const pdfBuffer: Buffer = Buffer.from(await page.pdf({
                    format: 'A4',
                    printBackground: true,
                    // margin: { top: '20px', bottom: '20px', left: '20px', right: '20px' },
                }));

                // Metadata
                const title = await page.title();
                const content = await page.evaluate(() => document.body?.innerText || '');
                const language = langdetect.detectOne(content.slice(0, 5000)) || 'unknown';

                // The file name combines the sanitised title with a short hash of the URL,
                // so two pages sharing a title no longer overwrite each other's PDF.
                // The title part is capped to keep the name within common filename limits.
                const safeTitle = (title || 'page').replace(/[^a-zA-Z0-9-_]/g, '_').slice(0, 150);
                let urlHash = 0;
                for (let i = 0; i < url.length; i++) {
                    urlHash = (Math.imul(urlHash, 31) + url.charCodeAt(i)) >>> 0;
                }
                const filename = `${safeTitle}-${urlHash.toString(36)}.pdf`;
                const folderName = user?.id?.toString();
                const pdfDir = path.resolve(__dirname, '..', '..', 'screenshots', folderName);
                // `recursive: true` makes this a no-op when the directory already exists.
                fs.mkdirSync(pdfDir, { recursive: true });
                const pdfPath = path.join(pdfDir, filename);

                // Save PDF and entry
                fs.writeFileSync(pdfPath, pdfBuffer);
                this.logger.log(`📄 PDF saved at: ${pdfPath}`);

                await this.KnowledgeBaseDetailRepo.save(
                    this.KnowledgeBaseDetailRepo.create({
                        url,
                        title,
                        content,
                        language,
                        s3path: pdfPath,
                        status: true,
                        uploadDocument: false,
                        user,
                        parent,
                    }),
                );
                this.logger.log(`✅ Saved crawl entry for ${url}`);
            }

            // Recursive crawl internal links
            if (depth > 0) {
                const internalLinks: string[] = await page.evaluate((currentUrl) => {
                    const origin = new URL(currentUrl).origin;
                    const sameOriginLinks = Array.from(document.querySelectorAll('a[href]'))
                        .map(a => a.getAttribute('href'))
                        .filter((href): href is string => Boolean(href))
                        .map(href => {
                            try {
                                return new URL(href, currentUrl);
                            } catch {
                                return null;
                            }
                        })
                        // Compare origins exactly: a prefix test would also accept
                        // hosts such as "<origin>.evil.com".
                        .filter((link): link is URL => link !== null && link.origin === origin)
                        .map(link => link.href.split('#')[0]);
                    // A Set keeps the first occurrence of each URL, in document order.
                    return Array.from(new Set(sameOriginLinks));
                }, url);

                for (const link of internalLinks) {
                    if (!visited.has(link)) {
                        await this.crawlPage(browser, link, user, depth - 1, visited, parent);
                    }
                }
            }

        } catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            this.logger.warn(`⚠️ Error crawling ${url}: ${reason}`);
        } finally {
            await page.close();
        }
    }

Crawl Page And Capture Screenshot PNG


This function does the following:

  • Visit a URL
  • Scroll to load dynamic content
  • Dismiss cookie popups
  • Take a full-page screenshot
  • Extract metadata (title, content, language)
  • Save it locally and/or to the database
  • Recursively crawl internal links


/**
 * Crawls `url` in a new browser page, stores a full-page PNG screenshot of it
 * together with extracted metadata, and then recurses into same-origin links.
 *
 * Per page: navigate, try to dismiss cookie/consent popups, scroll to trigger
 * lazy loading, and (only when no entry exists yet for this URL and user)
 * write the screenshot to disk and save a database entry.
 * Errors for a single page are logged as warnings and do not abort the crawl.
 *
 * @param browser Browser used to open one new page per crawled URL.
 * @param url     Absolute URL to crawl.
 * @param user    Owner of the crawl; `user.id` scopes the duplicate lookup and names the output folder.
 * @param depth   Remaining link depth: 0 crawls only `url`; negative values return immediately.
 * @param visited URLs already handled in this run; mutated in place.
 * @param parent  Passed through unchanged to every saved entry and every recursive call.
 */
private async crawlPageImage(
        browser: Browser,
        url: string,
        user: User,
        depth: number,
        visited: Set<string>,
        parent: any,
    ) {
        if (visited.has(url) || depth < 0) return;
        visited.add(url);

        this.logger.log(`🔍 Crawling URL: ${url} (Depth: ${depth})`);

        const existingEntryURL = await this.KnowledgeBaseDetailRepo.findOne({
            where: {
                url,
                user: { id: user.id },
            },
            relations: ['user'],
        });

        const page = await browser.newPage();

        try {
            await page.setUserAgent(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
            );
            await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-US,en;q=0.9' });
            await page.setDefaultNavigationTimeout(60000);

            await page.goto(url, { waitUntil: 'networkidle2' });

            // Dismiss cookie modals and popups (best effort): first by id/class hints...
            const rejectButtonSelectors = [
                '[id*="reject"]', '[id*="deny"]', '[id*="decline"]',
                '[class*="reject"]', '[class*="deny"]', '[class*="decline"]',
            ];

            let popupRejected = false;
            for (const selector of rejectButtonSelectors) {
                try {
                    const button = await page.waitForSelector(selector, { timeout: 2000 });
                    if (button) {
                        await button.click();
                        popupRejected = true;
                        break;
                    }
                } catch {
                    // Selector did not show up in time (or the click failed): try the next one.
                    continue;
                }
            }

            // ...then by button text. `button:text("...")` is not a selector that
            // `querySelector` understands, so the labels are matched in the page instead,
            // in priority order and case-insensitively.
            if (!popupRejected) {
                const rejectButtonLabels = ['Reject', 'Deny', 'Decline', 'Manage preferences'];
                try {
                    popupRejected = await page.evaluate((labels) => {
                        const buttons = Array.from(document.querySelectorAll('button'));
                        return labels.some((label) => {
                            const wanted = label.toLowerCase();
                            const match = buttons.find((candidate) =>
                                (candidate.textContent || '').toLowerCase().includes(wanted),
                            );
                            if (match) match.click();
                            return Boolean(match);
                        });
                    }, rejectButtonLabels);
                } catch {
                    // Popup handling must never fail the crawl of this page.
                }
            }
            if (popupRejected) {
                this.logger.log(`✅ Rejected popup on ${url}`);
            }

            // Scroll slowly to bottom (simulate user scroll to load dynamic content).
            // The step count is capped so that pages which keep growing (infinite
            // scroll) cannot loop forever.
            await page.evaluate(async () => {
                const delay = (ms: number) => new Promise((res) => setTimeout(res, ms));
                const distance = 200;
                const maxSteps = 300;
                let totalHeight = 0;
                for (let step = 0; step < maxSteps && totalHeight < document.body.scrollHeight; step++) {
                    window.scrollBy(0, distance);
                    await delay(500);
                    totalHeight += distance;
                }
            });

            // Scroll to top again for clean screenshot
            await page.evaluate(() => window.scrollTo(0, 0));
            await new Promise((res) => setTimeout(res, 4000));

            // Optional: put the first <header> into normal flow and force it visible
            // (in case some JS hides it)
            await page.evaluate(() => {
                const header = document.querySelector('header');
                if (header) {
                    header.style.position = 'relative';
                    header.style.top = '0';
                    header.style.left = '0';
                    header.style.width = '100%';
                    header.style.zIndex = '9999';
                    header.style.display = 'block';
                    header.style.visibility = 'visible';
                    header.style.opacity = '1';
                }
            });

            // Capture and persist only when this URL has no entry yet for this user;
            // otherwise the screenshot and metadata would be computed and thrown away.
            if (!existingEntryURL) {
                const { title, content } = await page.evaluate(() => ({
                    title: document.title || 'Untitled',
                    content: document.body?.innerText || '',
                }));

                const language = langdetect.detectOne(content.slice(0, 5000)) || 'unknown';
                const screenshotBuffer: Buffer = Buffer.from(await page.screenshot({ fullPage: true }));

                // The file name combines the sanitised title with a short hash of the URL,
                // so two pages sharing a title no longer overwrite each other's screenshot.
                // The title part is capped to keep the name within common filename limits.
                const safeTitle = (title || 'page').replace(/[^a-zA-Z0-9-_]/g, '_').slice(0, 150);
                let urlHash = 0;
                for (let i = 0; i < url.length; i++) {
                    urlHash = (Math.imul(urlHash, 31) + url.charCodeAt(i)) >>> 0;
                }
                const filename = `${safeTitle}-${urlHash.toString(36)}.png`;

                const folderName = user?.id?.toString();
                const screenshotsDir = path.resolve(__dirname, '..', '..', 'screenshots', folderName);
                // `recursive: true` makes this a no-op when the directory already exists.
                fs.mkdirSync(screenshotsDir, { recursive: true });

                const s3path = path.join(screenshotsDir, filename);
                fs.writeFileSync(s3path, screenshotBuffer);
                // const s3path = await this.s3Service.uploadBuffer(screenshotBuffer, filename, folderName);

                await this.KnowledgeBaseDetailRepo.save(
                    this.KnowledgeBaseDetailRepo.create({
                        url,
                        title,
                        content,
                        language,
                        s3path,
                        status: true,
                        uploadDocument: false,
                        user,
                        parent
                    }),
                );

                this.logger.log(`✅ Saved crawled data: ${url}`);
            }

            if (depth > 0) {
                const internalLinks: string[] = await page.evaluate((currentUrl) => {
                    const origin = new URL(currentUrl).origin;
                    const sameOriginLinks = Array.from(document.querySelectorAll('a[href]'))
                        .map((a) => a.getAttribute('href'))
                        .filter((href): href is string => Boolean(href))
                        .map((href) => {
                            try {
                                return new URL(href, currentUrl);
                            } catch (e) {
                                return null;
                            }
                        })
                        // Compare origins exactly: a prefix test would also accept
                        // hosts such as "<origin>.evil.com".
                        .filter((link): link is URL => link !== null && link.origin === origin)
                        .map((link) => link.href.split('#')[0]); // remove hash fragments

                    // A Set keeps the first occurrence of each URL, in document order.
                    return Array.from(new Set(sameOriginLinks));
                }, url);

                for (const link of internalLinks) {
                    if (!visited.has(link)) {
                        // Recurse with the screenshot crawler so child pages are also
                        // stored as PNG (previously this called the PDF crawler).
                        await this.crawlPageImage(browser, link, user, depth - 1, visited, parent);
                    }
                }
            }
        } catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            this.logger.warn(`⚠️ Failed to crawl ${url}: ${reason}`);
        } finally {
            await page.close();
        }
    }

Friday, 27 December 2024

Clusters of Node.js

Clusters of Node.js

Node.js's cluster module allows you to create multiple instances of a Node.js application so that it can use multiple CPU cores effectively.

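By default, the cluster module distributes incoming connections across the workers using a round-robin approach (this is the default on all platforms except Windows).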

The workload is distributed evenly across the worker processes, which share the job of handling incoming traffic.

Example of Cluster with Node.js

const express = require('express');
const cluster = require('cluster');
const os = require('os');

const PORT = 3300; // Port every worker listens on
const numCPUs = os.cpus().length; // Number of workers to fork (one per CPU core)

/**
 * Primary process: forks one worker per CPU core and forks a replacement
 * whenever a worker exits.
 */
function startPrimary() {
  console.log(`Master process started with PID: ${process.pid}`);

  // One fork per core
  Array.from({ length: numCPUs }).forEach(() => cluster.fork());

  // Replace any worker that exits
  cluster.on('exit', (worker) => {
    console.log(`Worker ${worker.process.pid} exited. Starting a new worker...`);
    cluster.fork();
  });
}

/**
 * Worker process: runs an Express app whose responses report the PID of the
 * process that handled the request.
 */
function startWorker() {
  const app = express();

  // Log which process served each request
  app.use((req, res, next) => {
    console.log(`Request handled by process ${process.pid}`);
    next();
  });

  // All example routes answer with the same payload shape
  for (const route of ['/', '/about', '/contact']) {
    app.get(route, (req, res) => {
      res.send({
        message: `Handled by process ${process.pid}`,
        route,
      });
    });
  }

  app.listen(PORT, () => {
    console.log(`Worker ${process.pid} started. Listening on port ${PORT}`);
  });
}

if (cluster.isPrimary) {
  startPrimary();
} else {
  startWorker();
}

Thursday, 19 December 2024

Node.js handle uncaught exceptions

Handling uncaught exceptions and unhandled promise rejections in a Node.js (Express) application.

const express = require('express');

const app = express();

// Middleware to parse JSON requests
app.use(express.json());

// A sample route
app.get('/', (req, res) => {
    res.send('Welcome to the Express.js app!');
});

// An example of a route with a potential error.
// Express catches synchronous throws inside route handlers and forwards them
// to the error-handling middleware below, so this one does not reach the
// process-level 'uncaughtException' handler.
app.get('/error', (req, res) => {
    throw new Error('This is an uncaught exception!');
});

// Global error handling middleware (must declare all four parameters)
app.use((err, req, res, next) => {
    console.error('Error caught by middleware:', err.message);
    // If the response has already started, a new status/body cannot be sent;
    // hand the error to Express's default handler instead.
    if (res.headersSent) {
        return next(err);
    }
    res.status(500).json({ message: 'Internal Server Error' });
});

// Start the server
const PORT = 3000;
const server = app.listen(PORT, () => {
    console.log(`Server is running on http://localhost:${PORT}`);
});

// Upper bound on how long a graceful shutdown may take before exiting anyway.
const SHUTDOWN_TIMEOUT_MS = 10000;
let shuttingDown = false;

/**
 * Stops accepting connections and exits with a failure code.
 * server.close() only calls back once existing connections have ended, so a
 * lingering keep-alive connection could otherwise keep the process alive
 * indefinitely; the timer forces the exit after SHUTDOWN_TIMEOUT_MS.
 *
 * @param {string} closedMessage Logged once the server has closed.
 */
function shutdown(closedMessage) {
    if (shuttingDown) return; // A second fatal error must not restart the shutdown
    shuttingDown = true;

    // unref() so the timer itself never keeps the process alive
    setTimeout(() => process.exit(1), SHUTDOWN_TIMEOUT_MS).unref();

    server.close(() => {
        console.log(closedMessage);
        process.exit(1); // Exit with a failure code
    });
}

// Handle uncaught exceptions
process.on('uncaughtException', (err) => {
    console.error('Uncaught Exception:', err.message);
    console.error(err.stack);

    // Perform cleanup if necessary, then exit
    shutdown('Server closed due to uncaught exception');
});

// Handle unhandled promise rejections
process.on('unhandledRejection', (reason, promise) => {
    console.error('Unhandled Rejection at:', promise, 'reason:', reason);

    // Perform cleanup if necessary, then exit
    shutdown('Server closed due to unhandled rejection');
});

Node.js with Sample Mock API

Node.js sample: a mock API built with Express

const express = require('express');
const app = express();
const PORT = 3000;

// Mock data (in memory only; reset on every restart)
const users = [
  { id: 1, name: 'John Doe', email: 'john.doe@example.com' },
  { id: 2, name: 'Jane Smith', email: 'jane.smith@example.com' },
  { id: 3, name: 'Sam Johnson', email: 'sam.johnson@example.com' },
];

// Next id to hand out. A counter is used instead of `users.length + 1`
// because the length shrinks after a delete, which would reuse an id that
// still belongs to another user.
let nextUserId = users.reduce((maxId, u) => Math.max(maxId, u.id), 0) + 1;

// Middleware
app.use(express.json());

// Routes
// Get all users
app.get('/api/users', (req, res) => {
  res.status(200).json(users);
});

// Get user by ID (a non-numeric id parses to NaN and falls through to 404)
app.get('/api/users/:id', (req, res) => {
  const userId = parseInt(req.params.id, 10);
  const user = users.find(u => u.id === userId);
  if (user) {
    res.status(200).json(user);
  } else {
    res.status(404).json({ message: 'User not found' });
  }
});

// Add a new user
app.post('/api/users', (req, res) => {
  // Fall back to an empty object so a request without a parsed body
  // does not throw while destructuring.
  const { name, email } = req.body || {};
  const newUser = {
    id: nextUserId++,
    name,
    email,
  };
  users.push(newUser);
  res.status(201).json(newUser);
});

// Update a user by ID (replaces name and email; the id is kept)
app.put('/api/users/:id', (req, res) => {
  const userId = parseInt(req.params.id, 10);
  const userIndex = users.findIndex(u => u.id === userId);
  if (userIndex !== -1) {
    const { name, email } = req.body || {};
    users[userIndex] = { id: userId, name, email };
    res.status(200).json(users[userIndex]);
  } else {
    res.status(404).json({ message: 'User not found' });
  }
});

// Delete a user by ID
app.delete('/api/users/:id', (req, res) => {
  const userId = parseInt(req.params.id, 10);
  const userIndex = users.findIndex(u => u.id === userId);
  if (userIndex !== -1) {
    users.splice(userIndex, 1);
    res.status(204).send(); // No content
  } else {
    res.status(404).json({ message: 'User not found' });
  }
});

// Start the server
app.listen(PORT, () => {
  console.log(`Mock API is running at http://localhost:${PORT}`);
});

Node.js with create server using Express with middleware function



Creating a server using Express.

const express = require('express');
const app = express();

// Parse JSON request bodies into req.body
app.use(express.json());

/**
 * Custom middleware: when the request carries an object body, flags it with
 * `modified: true` and logs the result, then always continues the chain.
 */
function tagRequestBody(req, res, next) {
  const body = req.body;
  const hasObjectBody = Boolean(body) && typeof body === 'object';

  if (hasObjectBody) {
    body.modified = true;
    console.log('Modified Request Data:', body);
  }

  next();
}

app.use(tagRequestBody);

// Echo route used to observe the effect of the middleware
app.post('/data', (req, res) => {
  const payload = {
    message: 'Request received successfully!',
    requestData: req.body,
  };
  res.send(payload);
});

// Start listening
const PORT = 3000;
app.listen(PORT, () => {
  console.log(`Server is running on http://localhost:${PORT}`);
});