8000 Python-web-scraping/selenium-scraper.py at main · luminati-io/Python-web-scraping · GitHub
[go: up one dir, main page]

Skip to content
{"payload":{"allShortcutsEnabled":false,"fileTree":{"":{"items":[{"name":"scrapy_scraping","path":"scrapy_scraping","contentType":"directory"},{"name":"README.md","path":"README.md","contentType":"file"},{"name":"requests-beautifulsoup-scraper.py","path":"requests-beautifulsoup-scraper.py","contentType":"file"},{"name":"selenium-scraper.py","path":"selenium-scraper.py","contentType":"file"},{"name":"titles.csv","path":"titles.csv","contentType":"file"},{"name":"titles.json","path":"titles.json","contentType":"file"}],"totalCount":6}},"fileTreeProcessingTime":7.919051,"foldersToFetch":[],"incompleteFileTree":false,"repo":{"id":910415592,"defaultBranch":"main","name":"Python-web-scraping","ownerLogin":"luminati-io","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2024-12-31T07:59:38.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/19207323?v=4","public":true,"private":false,"isOrgOwned":true},"codeLineWrapEnabled":false,"symbolsExpanded":false,"treeExpanded":true,"refInfo":{"name":"main","listCacheKey":"v0:1735631979.826916","canEdit":false,"refType":"branch","currentOid":"0e51cb49162675938357832d37b5b581beee5d07"},"path":"selenium-scraper.py","currentUser":null,"blob":{"rawLines":["from selenium import webdriver\r","from selenium.webdriver.chrome.service import Service\r","from selenium.webdriver.chrome.options import Options\r","from selenium.webdriver.common.by import By\r","import csv\r","\r","# Set up the WebDriver that operates in headless mode\r","options = Options()\r","options.add_argument(\"--headless\")\r","driver = webdriver.Chrome(service=Service(), options=options)\r","\r","# URL of the page to scrape\r","url = \"https://en.wikipedia.org/wiki/Web_scraping\"\r","\r","# Open the URL in the browser\r","driver.get(url)\r","\r","# List to store the scraped titles\r","titles = []\r","\r","# List of header levels (h1, h2, h3, h4, h5)\r","title_level_list = [1, 2, 3, 4, 5]\r","\r","# Loop through each header level (h1, h2, h3, h4, h5)\r","for title_level in title_level_list:\r"," # Find all elements of the current header level using a CSS Selector\r"," title_elements = driver.find_elements(By.CSS_SELECTOR, f\"h{title_level}\")\r","\r"," # Loop through each title element found\r"," for title_element in title_elements:\r"," # Data extraction logic\r"," tag = title_element.tag_name\r"," text = title_element.text\r","\r"," # Create a dictionary with the tag and the title text\r"," title = {\r"," \"tag\": tag,\r"," \"title\": text,\r"," }\r","\r"," # Append the dictionary to the titles list\r"," titles.append(title)\r","\r","# Close the browser\r","driver.quit()\r","\r","# Open a CSV file to write the data\r","with open(\"titles.csv\", mode=\"w\", newline=\"\", encoding=\"utf-8\") as file:\r"," # Create a CSV writer object and specify the fieldnames (columns)\r"," writer = csv.DictWriter(file, fieldnames=[\"tag\", \"title\"])\r","\r"," # Write the header (column names) to the CSV file\r"," writer.writeheader()\r","\r"," # Write each row (dictionary) to the CSV file\r"," for row in titles:\r"," writer.writerow(row)\r"],"stylingDirectives":null,"colorizedLines":null,"csv":null,"csvError":null,"dependabotInfo":{"showConfigurationBanner":false,"configFilePath":null,"networkDependabotPath":"/luminati-io/Python-web-scraping/network/updates","dismissConfigurationNoticePath":"/settings/dismiss-notice/dependabot_configuration_notice","configurationNoticeDismissed":null},"displayName":"selenium-scraper.py","displayUrl":"https://github.com/luminati-io/Python-web-scraping/blob/main/selenium-scraper.py?raw=true","headerInfo":{"blobSize":"1.75 KB","deleteTooltip":"You must be signed in to make or propose changes","editTooltip":"You must be signed in to make or propose changes","ghDesktopPath":"https://desktop.github.com","isGitLfs":false,"onBranch":true,"shortPath":"0fd7cd9","siteNavLoginPath":"/login?return_to=https%3A%2F%2Fgithub.com%2Fluminati-io%2FPython-web-scraping%2Fblob%2Fmain%2Fselenium-scraper.py","isCSV":false,"isRichtext":false,"toc":null,"lineInfo":{"truncatedLoc":"57","truncatedSloc":"44"},"mode":"file"},"image":false,"isCodeownersFile":null,"isPlain":false,"isValidLegacyIssueTemplate":false,"issueTemplate":null,"discussionTemplate":null,"language":"Python","languageID":303,"large":false,"planSupportInfo":{"repoIsFork":null,"repoOwnedByCurrentUser":null,"requestFullPath":"/luminati-io/Python-web-scraping/blob/main/selenium-scraper.py","showFreeOrgGatedFeatureMessage":null,"showPlanSupportBanner":null,"upgradeDataAttributes":null,"upgradePath":null},"publishBannersInfo":{"dismissActionNoticePath":"/settings/dismiss-notice/publish_action_from_dockerfile","releasePath":"/luminati-io/Python-web-scraping/releases/new?marketplace=true","showPublishActionBanner":false},"rawBlobUrl":"https://github.com/luminati-io/Python-web-scraping/raw/refs/heads/main/selenium-scraper.py","renderImageOrRaw":false,"richText":null,"renderedFileInfo":null,"shortPath":null,"symbolsEnabled":true,"tabSize":8,"topBannersInfo":{"overridingGlobalFundingFile":false,"globalPreferredFundingPath":null,"showInvalidCitationWarning":false,"citationHelpUrl":"https://docs.github.com/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files","actionsOnboardingTip":null},"truncated":false,"viewable":true,"workflowRedirectUrl":null,"symbols":null},"copilotInfo":null,"copilotAccessAllowed":false,"modelsAccessAllowed":false,"modelsRepoIntegrationEnabled":false,"csrf_tokens":{"/luminati-io/Python-web-scraping/branches":{"post":"ap_T1JahDBXJYGuPTISILKbxN4u1GeztoZeD1NlRvo4Bx9VOBeQo8n1mRiEq2BIWREt3c32fuY1_spJKxu1diA"},"/repos/preferences":{"post":"3uIpauXr00EJNE-8GHo7o9cgd7UvsSqKoFeH9SrRPoFwv2A1UIoEwAX2gUui2ES-GRzjgEsBnFKs_tTgpCaWQg"}}},"title":"Python-web-scraping/selenium-scraper.py at main · luminati-io/Python-web-scraping","appPayload":{"helpUrl":"https://docs.github.com","findFileWorkerPath":"/assets-cdn/worker/find-file-worker-263cab1760dd.js","findInFileWorkerPath":"/assets-cdn/worker/find-in-file-worker-b84e9496fc59.js","githubDevUrl":null,"enabled_features":{"code_nav_ui_events":false,"react_blob_overlay":false,"accessible_code_button":true}}}
0