This function does the following:
- Full-Page PDF Generation
- Recursive Internal Link Crawling
- Content and Metadata Extraction
- Intelligent Overlay & Popup Handling
- Auto Scroll to Load Dynamic Content
- Persistent File & DB Storage
- User and Context-Aware
- Timeout and Error Handling
/**
 * Crawls one URL in a fresh browser page, stores a PDF snapshot and the
 * extracted text/metadata for it, then recursively crawls same-origin links.
 *
 * Steps, in order:
 *  1. Skip URLs already in `visited` (or when `depth` is negative).
 *  2. Load the page, try to dismiss a cookie/consent popup, and scroll through
 *     the page so lazily loaded content gets a chance to render.
 *  3. Remove overlay-like elements and un-stick `<header>` elements.
 *  4. If no entry exists yet for this URL and user, render an A4 PDF, write it
 *     under `screenshots/<user id>/`, and save a knowledge-base entry.
 *  5. If `depth > 0`, collect same-origin links (fragments stripped, deduped)
 *     and crawl each with `depth - 1`.
 *
 * Errors raised while the page is being processed are logged as warnings and
 * do not propagate; errors from the initial repository lookup or from opening
 * the page are not caught here.
 *
 * @param browser Browser instance used to open a new page per URL.
 * @param url     Absolute URL to crawl.
 * @param user    Owner of the stored entry; `user.id` also names the output folder.
 * @param depth   Remaining link depth; `0` stores the page without following links.
 * @param visited URLs already handled in this crawl; mutated by this method.
 * @param parent  Passed through unchanged to the saved entry and to recursive calls.
 */
private async crawlPage(
  browser: Browser,
  url: string,
  user: User,
  depth: number,
  visited: Set<string>,
  parent: any,
): Promise<void> {
  if (visited.has(url) || depth < 0) return;
  visited.add(url);
  this.logger.log(`🔍 Crawling URL: ${url} (Depth: ${depth})`);

  // Upper bound for the lazy-load scroll loop (500 px every 500 ms per step),
  // so pages that keep growing while scrolled cannot stall the crawl forever.
  const maxScrollSteps = 120;

  const existingEntryURL = await this.KnowledgeBaseDetailRepo.findOne({
    where: { url, user: { id: user.id } },
    relations: ['user'],
  });

  // Nothing to store and no links to follow: avoid loading the page at all.
  if (existingEntryURL && depth === 0) {
    this.logger.log(`⏭️ Skipping already stored URL: ${url}`);
    return;
  }

  const page = await browser.newPage();
  try {
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    );
    await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-US,en;q=0.9' });
    page.setDefaultNavigationTimeout(60000);
    await page.goto(url, { waitUntil: 'networkidle2' });

    // --- Reject cookie popups (best effort) ---------------------------------
    const rejectButtonSelectors = [
      '[id*="reject"]', '[id*="deny"]', '[id*="decline"]',
      '[class*="reject"]', '[class*="deny"]', '[class*="decline"]',
    ];
    // Button labels are matched in the DOM by text because a text pseudo-class
    // such as `button:text("Reject")` is not valid CSS and makes the selector
    // query throw.
    const rejectButtonLabels = ['Reject', 'Deny', 'Decline', 'Manage preferences'];
    let popupRejected = false;
    try {
      // One combined wait instead of one 2 s timeout per selector.
      const button = await page.waitForSelector(rejectButtonSelectors.join(', '), {
        timeout: 2000,
      });
      if (button) {
        await button.click();
        popupRejected = true;
      }
    } catch {
      // No matching element appeared in time, or it could not be clicked.
    }
    if (!popupRejected) {
      try {
        popupRejected = await page.evaluate((labels: string[]) => {
          const wanted = labels.map(label => label.toLowerCase());
          const buttons = Array.from(document.querySelectorAll('button')) as HTMLElement[];
          const match = buttons.find(candidate => {
            const text = (candidate.innerText || '').trim().toLowerCase();
            return wanted.some(label => text.includes(label));
          });
          if (!match) return false;
          match.click();
          return true;
        }, rejectButtonLabels);
      } catch {
        // Popup handling is optional; continue with the page as it is.
      }
    }
    if (popupRejected) {
      this.logger.log(`✅ Rejected popup on ${url}`);
    }

    // --- Scroll through the page to trigger lazy loading --------------------
    await page.evaluate(async (maxSteps: number) => {
      const delay = (ms: number) => new Promise(res => setTimeout(res, ms));
      const distance = 500;
      let totalHeight = 0;
      let steps = 0;
      // scrollHeight is re-read every step because lazy content can extend it.
      while (steps < maxSteps && totalHeight < (document.body?.scrollHeight ?? 0)) {
        window.scrollBy(0, distance);
        await delay(500);
        totalHeight += distance;
        steps += 1;
      }
    }, maxScrollSteps);

    // --- Remove overlays and normalise headers ------------------------------
    await page.evaluate(() => {
      // Remove cookie overlays and full-screen modals
      const overlaySelectors = [
        '[class*="overlay"]', '[id*="overlay"]',
        '.popup', '.modal', '[class*="cookie"]', '[id*="cookie"]',
        '[class*="consent"]', '[id*="consent"]',
      ];
      overlaySelectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => el.remove());
      });
      // Put headers back into normal flow (this does un-stick them) and force
      // them visible, so they appear once at the top of the document.
      const headers = document.querySelectorAll('header') as NodeListOf<HTMLElement>;
      headers.forEach(header => {
        header.style.position = 'relative';
        header.style.top = '0';
        header.style.left = '0';
        header.style.width = '100%';
        header.style.zIndex = '9999';
        header.style.display = 'block';
        header.style.visibility = 'visible';
        header.style.opacity = '1';
      });
    });

    // --- Snapshot and persist (only for URLs without an existing entry) -----
    if (!existingEntryURL) {
      await page.evaluate(() => window.scrollTo(0, 0));
      // Short pause after jumping back to the top before rendering the PDF.
      await new Promise(res => setTimeout(res, 2000));
      const pdfBuffer: Buffer = Buffer.from(await page.pdf({
        format: 'A4',
        printBackground: true,
      }));

      // Metadata
      const title = await page.title();
      const content = await page.evaluate(() => document.body?.innerText || '');
      const language = langdetect.detectOne(content.slice(0, 5000)) || 'unknown';

      // Titles are often shared by many pages of a site, so a short hash of the
      // URL (32-bit FNV-1a) is appended to keep one PDF from overwriting another.
      let urlHash = 0x811c9dc5;
      for (let i = 0; i < url.length; i++) {
        urlHash ^= url.charCodeAt(i);
        urlHash = Math.imul(urlHash, 0x01000193);
      }
      const urlTag = (urlHash >>> 0).toString(16);

      // Title part capped at 200 chars so the full file name stays below the
      // common 255-byte file-name limit.
      const safeTitle = (title || 'page').replace(/[^a-zA-Z0-9_-]/g, '_').slice(0, 200);
      const filename = `${safeTitle}_${urlTag}.pdf`;
      const folderName = user.id.toString();
      const pdfDir = path.resolve(__dirname, '..', '..', 'screenshots', folderName);
      // `recursive: true` makes this a no-op when the directory already exists.
      fs.mkdirSync(pdfDir, { recursive: true });
      const pdfPath = path.join(pdfDir, filename);

      fs.writeFileSync(pdfPath, pdfBuffer);
      this.logger.log(`📄 PDF saved at: ${pdfPath}`);
      await this.KnowledgeBaseDetailRepo.save(
        this.KnowledgeBaseDetailRepo.create({
          url,
          title,
          content,
          language,
          s3path: pdfPath,
          status: true,
          uploadDocument: false,
          user,
          parent,
        }),
      );
      this.logger.log(`✅ Saved crawl entry for ${url}`);
    }

    // --- Recursive crawl of internal links ----------------------------------
    if (depth > 0) {
      const internalLinks: string[] = await page.evaluate((currentUrl: string) => {
        const origin = new URL(currentUrl).origin;
        const links = new Set<string>();
        Array.from(document.querySelectorAll('a[href]')).forEach(anchor => {
          const href = anchor.getAttribute('href');
          if (!href) return;
          try {
            const resolved = new URL(href, currentUrl);
            // Compare parsed origins: a string prefix test would also accept
            // hosts such as "https://example.com.evil.com".
            if (resolved.origin !== origin) return;
            resolved.hash = '';
            links.add(resolved.href);
          } catch {
            // Unparseable href: not a crawlable link.
          }
        });
        return Array.from(links);
      }, url);

      // The page is no longer needed; close it before descending so a deep
      // crawl does not keep one open page per recursion level.
      await page.close();

      for (const link of internalLinks) {
        if (!visited.has(link)) {
          await this.crawlPage(browser, link, user, depth - 1, visited, parent);
        }
      }
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    this.logger.warn(`⚠️ Error crawling ${url}: ${message}`);
  } finally {
    if (!page.isClosed()) {
      await page.close();
    }
  }
}