[go: up one dir, main page]

0% found this document useful (0 votes)
5 views6 pages

Striver Scraper.py

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
5 views6 pages

Striver Scraper.py

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 6

const puppeteer = require('puppeteer');

const fs = require('fs').promises;
const PDFDocument = require('pdfkit');

/**
 * Scrapes the Striver SDE sheet on takeuforward.org: collects the question
 * links from the sheet page, visits each one to pull a short description,
 * code snippets and complexity notes, then writes the results to a JSON file
 * and a summary PDF.
 *
 * Relies on the file-level `puppeteer`, `fs` (promises API) and `PDFDocument`
 * bindings.
 */
class StriverSDEScraper {
  /**
   * @param {object} [options]
   * @param {boolean} [options.headless=false] Run the browser without a window.
   * @param {number} [options.requestDelayMs=2000] Pause after each successfully
   *   scraped question page, to avoid hammering the site.
   */
  constructor({ headless = false, requestDelayMs = 2000 } = {}) {
    this.baseUrl = 'https://takeuforward.org';
    this.sheetUrl = 'https://takeuforward.org/interviews/strivers-sde-sheet-top-coding-interview-problems';
    this.questionsData = [];
    this.browser = null;
    this.page = null;
    this.headless = headless;
    this.requestDelayMs = requestDelayMs;
  }

  /**
   * Launches the browser and opens the single page used for all navigation.
   * `this.browser` is assigned before the page is created so that a caller's
   * cleanup can still close it if a later step throws.
   *
   * @returns {Promise<void>}
   */
  async init() {
    console.log('🚀 Initializing browser...');
    this.browser = await puppeteer.launch({
      headless: this.headless,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    this.page = await this.browser.newPage();

    // Set user agent
    await this.page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    );
  }

  /**
   * Loads the sheet page and returns the de-duplicated links that look like
   * question pages.
   *
   * @returns {Promise<Array<{title: string, url: string}>>} The links found;
   *   an empty array when the page could not be loaded or parsed.
   */
  async getQuestionLinks() {
    console.log('🔍 Fetching SDE sheet page...');

    try {
      await this.page.goto(this.sheetUrl, { waitUntil: 'networkidle2' });

      // Everything inside evaluate() runs in the browser, so it may only use
      // names defined inside the callback.
      const questionLinks = await this.page.evaluate(() => {
        const keywords = ['data-structure', 'algorithm', 'problem', 'solution'];
        const stripHash = (url) => url.split('#')[0];
        // The sheet URL itself contains "problems"; without this guard every
        // in-page anchor would queue the sheet as if it were a question.
        const currentPage = stripHash(window.location.href);

        const seen = new Set();
        const uniqueLinks = [];

        document.querySelectorAll('a[href*="takeuforward.org"]').forEach((anchor) => {
          const url = stripHash(anchor.href);
          const title = anchor.textContent.trim();

          if (!url || !title || url === currentPage) {
            return;
          }
          if (!keywords.some((keyword) => url.includes(keyword))) {
            return;
          }
          // Fragments are stripped above so two anchors into the same page
          // count as one question.
          if (seen.has(url)) {
            return;
          }

          seen.add(url);
          uniqueLinks.push({ title, url });
        });

        return uniqueLinks;
      });

      console.log(`✅ Found ${questionLinks.length} question links`);
      return questionLinks;
    } catch (error) {
      console.error('❌ Error fetching question links:', error);
      return [];
    }
  }

  /**
   * Visits one question page and extracts its description, code snippets and
   * complexity notes.
   *
   * @param {string} questionUrl Page to scrape.
   * @param {string} title Link text from the sheet; stored as the question title.
   * @returns {Promise<object|null>} An object with `title`, `url`,
   *   `description`, `approaches`, `timeComplexity` and `spaceComplexity`, or
   *   `null` when the page could not be scraped.
   */
  async scrapeQuestionContent(questionUrl, title) {
    console.log(`📖 Scraping: ${title}`);

    try {
      await this.page.goto(questionUrl, { waitUntil: 'networkidle2' });

      // Runs in the browser: helpers must be declared inside the callback.
      const questionData = await this.page.evaluate(() => {
        const NOT_SPECIFIED = 'Not specified';
        const CODE_SELECTOR = 'pre, code, .highlight';
        // Allows one level of nested parentheses so O(log(n)) is kept whole.
        const BIG_O = 'O\\((?:[^()]|\\([^()]*\\))+\\)';

        // Returns the first O(...) that is labelled with `type` ('time' or
        // 'space'), preferring the explicit "<type> complexity" wording.
        function findComplexity(text, type) {
          const patterns = [
            new RegExp(`${type} complexity[:\\s]*(${BIG_O})`, 'i'),
            new RegExp(`${type}[:\\s]*(${BIG_O})`, 'i'),
          ];
          for (const pattern of patterns) {
            const match = text.match(pattern);
            if (match) {
              return match[1];
            }
          }
          return NOT_SPECIFIED;
        }

        const data = {
          title: document.title || 'Unknown',
          description: '',
          approaches: [],
          timeComplexity: '',
          spaceComplexity: '',
        };

        // Extract description: the first three paragraphs, minus short ones.
        const contentDiv = document.querySelector('.entry-content, .post-content, .content');
        if (contentDiv) {
          data.description = Array.from(contentDiv.querySelectorAll('p'))
            .slice(0, 3)
            .map((p) => p.textContent.trim())
            .filter((text) => text.length > 20)
            .join('\n');
        }

        // Extract code blocks. Only outermost matches are kept, otherwise a
        // <pre><code>…</code></pre> snippet would be recorded twice.
        const codeTexts = Array.from(document.querySelectorAll(CODE_SELECTOR))
          .filter((el) => !(el.parentElement && el.parentElement.closest(CODE_SELECTOR)))
          .map((el) => el.textContent.trim())
          .filter((text) => text.length > 50);

        data.approaches = codeTexts.map((code, index) => {
          let name;
          if (index === 0) {
            name = 'Brute Force';
          } else if (code.toLowerCase().includes('optimal') || index === codeTexts.length - 1) {
            name = 'Optimal Solution';
          } else {
            name = `Approach ${index + 1}`;
          }

          return {
            name,
            code,
            timeComplexity: findComplexity(code, 'time'),
            spaceComplexity: findComplexity(code, 'space'),
          };
        });

        // Extract complexity from the page text. The patterns are
        // case-insensitive, so the text keeps its original casing.
        const fullText = document.body.textContent;
        data.timeComplexity = findComplexity(fullText, 'time');
        data.spaceComplexity = findComplexity(fullText, 'space');

        return data;
      });

      questionData.title = title;
      questionData.url = questionUrl;

      // Wait between requests
      await new Promise((resolve) => setTimeout(resolve, this.requestDelayMs));

      return questionData;
    } catch (error) {
      console.error(`❌ Error scraping ${title}:`, error);
      return null;
    }
  }

  /**
   * Runs the whole pipeline: start the browser, collect links, scrape every
   * question, then write the JSON and PDF outputs. The browser is closed even
   * when start-up or scraping fails.
   *
   * @returns {Promise<void>}
   */
  async scrapeAllQuestions() {
    console.log('🚀 Starting Striver SDE Sheet scraping...');

    try {
      // Inside the try so a failure after launch still reaches the cleanup.
      await this.init();

      // Get all question links
      const questionLinks = await this.getQuestionLinks();

      if (questionLinks.length === 0) {
        console.log('❌ No question links found!');
        return;
      }

      console.log(`📚 Found ${questionLinks.length} questions to scrape`);

      // Scrape each question, one at a time on the shared page.
      for (let i = 0; i < questionLinks.length; i++) {
        const link = questionLinks[i];
        console.log(`\n[${i + 1}/${questionLinks.length}] Processing...`);

        const questionData = await this.scrapeQuestionContent(link.url, link.title);

        if (questionData) {
          this.questionsData.push(questionData);
          console.log(`✅ Scraped: ${questionData.title}`);
        } else {
          console.log(`❌ Failed to scrape: ${link.title}`);
        }

        // Progress update
        if ((i + 1) % 10 === 0) {
          console.log(`\n📊 Progress: ${i + 1}/${questionLinks.length} questions completed`);
        }
      }

      console.log(`\n🎉 Scraping completed! Total questions: ${this.questionsData.length}`);

      // Save to JSON
      await this.saveToJson();

      // Generate PDF
      await this.generatePDF();
    } finally {
      if (this.browser) {
        await this.browser.close();
        this.browser = null;
        this.page = null;
      }
    }
  }

  /**
   * Writes the scraped questions to `striver_sde_questions.json`.
   *
   * @returns {Promise<void>}
   */
  async saveToJson() {
    const filename = 'striver_sde_questions.json';
    await fs.writeFile(filename, JSON.stringify(this.questionsData, null, 2));
    console.log(`💾 Data saved to ${filename}`);
  }

  /**
   * Renders the scraped questions into `Striver_SDE_Sheet_Complete.pdf` and
   * resolves once the file has been fully written. Errors are logged rather
   * than thrown.
   *
   * @returns {Promise<void>}
   */
  async generatePDF() {
    console.log('📄 Generating PDF...');

    try {
      const filename = 'Striver_SDE_Sheet_Complete.pdf';
      const doc = new PDFDocument();
      const stream = require('fs').createWriteStream(filename);
      doc.pipe(stream);

      // Title page
      doc.fontSize(20).text('Striver SDE Sheet - Complete Solutions', 50, 50);
      doc.fontSize(12).text(`Total Questions: ${this.questionsData.length}`, 50, 100);
      doc.text(`Generated on: ${new Date().toLocaleDateString()}`, 50, 120);

      // Questions
      let yPosition = 180;

      this.questionsData.forEach((question, index) => {
        // Check if new page needed
        if (yPosition > 700) {
          doc.addPage();
          yPosition = 50;
        }

        // Question title
        doc.fontSize(14).text(`${index + 1}. ${question.title}`, 50, yPosition);
        yPosition += 30;

        // Description
        if (question.description) {
          const description = question.description.substring(0, 500);
          doc.fontSize(10).text('Problem Description:', 50, yPosition);
          yPosition += 15;
          doc.text(description, 50, yPosition, { width: 500 });
          // Measure the text that was actually drawn (truncated, wrapped at
          // the same width) instead of guessing from the full length.
          yPosition += doc.heightOfString(description, { width: 500 }) + 20;
        }

        // Approaches
        (question.approaches || []).forEach((approach) => {
          if (yPosition > 650) {
            doc.addPage();
            yPosition = 50;
          }

          doc.fontSize(12).text(approach.name, 60, yPosition);
          yPosition += 20;

          doc.fontSize(10).text(
            `Time: ${approach.timeComplexity} | Space: ${approach.spaceComplexity}`,
            60,
            yPosition
          );
          yPosition += 15;

          // Code (truncated for PDF): first 10 lines, 80 characters each.
          approach.code.split('\n').slice(0, 10).forEach((line) => {
            if (yPosition > 750) {
              doc.addPage();
              yPosition = 50;
            }
            doc.text(line.substring(0, 80), 60, yPosition);
            yPosition += 12;
          });

          yPosition += 20;
        });

        yPosition += 30;
      });

      doc.end();

      // Everything above is synchronous, so no stream event can fire before
      // these listeners are attached. Waiting here keeps callers from
      // treating the PDF as done while it is still being flushed to disk.
      await new Promise((resolve, reject) => {
        stream.on('finish', resolve);
        stream.on('error', reject);
      });

      console.log(`✅ PDF generated: ${filename}`);
    } catch (error) {
      console.error('❌ Error generating PDF:', error);
    }
  }
}

/**
 * Script entry point: builds a scraper with default settings and runs the
 * full scrape-and-export pipeline to completion.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await new StriverSDEScraper().scrapeAllQuestions();
}

// Run the scraper. A failure is logged and also reflected in the exit status,
// so shells and CI do not see a crashed run as a success.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

You might also like