Web Scraping with Python
Web scraping tools and techniques with Python. Includes BeautifulSoup, Scrapy, Selenium, and crawl4ai patterns.
BeautifulSoup - HTML Parsing
Installation
pip install beautifulsoup4 requests lxml
Basic Usage
import requests
from bs4 import BeautifulSoup

# Fetch page
url = "https://example.com"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

# Find elements
title = soup.find('title').text
links = soup.find_all('a')
divs = soup.find_all('div', class_='content')

# CSS selectors
items = soup.select('.item')
first_item = soup.select_one('#first')

# Navigate
parent = soup.find('div').parent
siblings = soup.find('div').find_next_siblings()
children = soup.find('div').children

# Extract data
for link in soup.find_all('a'):
    href = link.get('href')
    text = link.text.strip()
    print(f"{text}: {href}")
Complete Example
import requests
from bs4 import BeautifulSoup
import csv

def scrape_products(url):
    """Scrape product information from an e-commerce site"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    products = []

    for item in soup.select('.product-item'):
        product = {
            'name': item.select_one('.product-name').text.strip(),
            'price': item.select_one('.product-price').text.strip(),
            'rating': item.select_one('.product-rating')['data-rating'],
            'url': item.select_one('a')['href']
        }
        products.append(product)

    return products

def save_to_csv(products, filename='products.csv'):
    """Save products to a CSV file"""
    if not products:
        return  # nothing to write
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=products[0].keys())
        writer.writeheader()
        writer.writerows(products)

# Usage
products = scrape_products('https://example.com/products')
save_to_csv(products)
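Product listings are usually paginated. A minimal sketch of following a "next page" link across listing pages with requests and BeautifulSoup, reusing the example selectors from above (`.product-item`, `a.next-page` are placeholders for the target site's markup):

import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_all_pages(start_url, max_pages=50):
    """Follow 'next page' links and collect products from every listing page."""
    session = requests.Session()
    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    url = start_url
    all_products = []

    for _ in range(max_pages):                      # hard cap as a safety net
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        for item in soup.select('.product-item'):   # placeholder selectors
            all_products.append({
                'name': item.select_one('.product-name').text.strip(),
                'price': item.select_one('.product-price').text.strip(),
            })

        next_link = soup.select_one('a.next-page')  # placeholder pagination link
        if not next_link or not next_link.get('href'):
            break
        url = urljoin(url, next_link['href'])
        time.sleep(1)                               # be polite between pages

    return all_products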
Scrapy - Web Scraping Framework
Installation
pip install scrapy
Create Project
scrapy startproject myproject
cd myproject
scrapy genspider example example.com
Spider Example
# spiders/products_spider.py
import scrapy

class ProductsSpider(scrapy.Spider):
    name = 'products'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/products']

    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 4,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    def parse(self, response):
        """Parse product listing page"""
        for product in response.css('.product-item'):
            yield {
                'name': product.css('.product-name::text').get().strip(),
                'price': product.css('.product-price::text').get().strip(),
                'rating': product.css('.product-rating::attr(data-rating)').get(),
                'url': response.urljoin(product.css('a::attr(href)').get())
            }

            # To scrape detail pages instead, follow each product link:
            # yield response.follow(product.css('a::attr(href)').get(), callback=self.parse_product)

        # Follow pagination
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_product(self, response):
        """Parse an individual product page (reached via response.follow above)"""
        yield {
            'name': response.css('h1.product-title::text').get(),
            'price': response.css('.price::text').get(),
            'description': response.css('.description::text').get(),
            'images': response.css('.product-image::attr(src)').getall(),
            'specifications': {
                spec.css('.spec-name::text').get(): spec.css('.spec-value::text').get()
                for spec in response.css('.specification')
            }
        }
Run Spider
# Run spider
scrapy crawl products

# Save to JSON
scrapy crawl products -o products.json

# Save to CSV
scrapy crawl products -o products.csv

# Save to JSON Lines
scrapy crawl products -o products.jl
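Spiders can also be launched from a plain Python script instead of the CLI, using Scrapy's CrawlerProcess. A sketch, assuming the ProductsSpider above is importable from the myproject package:

# run_spider.py
from scrapy.crawler import CrawlerProcess
from myproject.spiders.products_spider import ProductsSpider  # assumed import path

process = CrawlerProcess(settings={
    'FEEDS': {'products.json': {'format': 'json', 'overwrite': True}},
    'DOWNLOAD_DELAY': 1,
})
process.crawl(ProductsSpider)
process.start()  # blocks until the crawl finishes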
Scrapy Settings
# settings.py
BOT_NAME = 'myproject'

# Obey robots.txt
ROBOTSTXT_OBEY = True

# Configure delays
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True

# Concurrent requests
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# User agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# Middleware (RandomUserAgentMiddleware requires the scrapy-user-agents package)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}

# Pipelines
ITEM_PIPELINES = {
    'myproject.pipelines.CleanDataPipeline': 300,
    'myproject.pipelines.SaveToDBPipeline': 400,
}

# AutoThrottle
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
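The ITEM_PIPELINES entries refer to classes in myproject/pipelines.py that are not shown above. A minimal sketch of what CleanDataPipeline might look like (the required field names are assumptions based on the spider example):

# myproject/pipelines.py
from scrapy.exceptions import DropItem

class CleanDataPipeline:
    def process_item(self, item, spider):
        # Drop items missing required fields (field names assumed from the spider above)
        if not item.get('name') or not item.get('price'):
            raise DropItem(f"Missing required field in {item!r}")

        # Normalize whitespace and strip a leading currency symbol from the price
        item['name'] = item['name'].strip()
        item['price'] = item['price'].strip().lstrip('$')
        return item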
Selenium - Browser Automation
Installation
pip install selenium webdriver-manager
Basic Usage
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

# Setup driver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in background
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

try:
    # Navigate to page
    driver.get('https://example.com')

    # Wait for element
    wait = WebDriverWait(driver, 10)
    element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'product-item'))
    )

    # Find elements
    products = driver.find_elements(By.CLASS_NAME, 'product-item')

    # Extract data
    for product in products:
        name = product.find_element(By.CLASS_NAME, 'product-name').text
        price = product.find_element(By.CLASS_NAME, 'product-price').text
        print(f"{name}: {price}")

    # Interact with page
    search_box = driver.find_element(By.NAME, 'q')
    search_box.send_keys('laptop')
    search_box.submit()

    # Wait for results
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'results')))

    # Scroll to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Take screenshot
    driver.save_screenshot('screenshot.png')

finally:
    driver.quit()
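Selenium 4.6+ bundles Selenium Manager, which resolves a matching driver automatically, so the webdriver-manager setup above is optional on current versions. A minimal sketch:

from selenium import webdriver

# With Selenium 4.6+, a matching chromedriver is downloaded automatically
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get('https://example.com')
print(driver.title)
driver.quit()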
Handle Dynamic Content
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def scrape_infinite_scroll(url):
    """Scrape a page with infinite scroll"""
    driver = webdriver.Chrome()
    driver.get(url)

    products = []
    try:
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down and give new content time to load
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Stop once the page height no longer grows
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Extract products once, after all content has loaded
        # (extracting inside the loop would collect duplicates on every pass)
        for item in driver.find_elements(By.CLASS_NAME, 'product-item'):
            products.append({
                'name': item.find_element(By.CLASS_NAME, 'name').text,
                'price': item.find_element(By.CLASS_NAME, 'price').text
            })
    finally:
        driver.quit()

    return products
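Some pages use a "Load more" button instead of infinite scroll. A sketch that clicks it until it stops appearing, using an explicit wait (the button selector is a placeholder):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_load_more(driver, max_clicks=20):
    """Keep clicking a 'Load more' button until it disappears."""
    wait = WebDriverWait(driver, 5)
    for _ in range(max_clicks):
        try:
            button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.load-more'))  # placeholder selector
            )
        except TimeoutException:
            break  # no button found -> all content loaded
        driver.execute_script("arguments[0].click();", button)  # JS click avoids overlay issues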
crawl4ai - AI-Powered Scraping
Installation
pip install crawl4ai
Basic Usage
from crawl4ai import WebCrawler

# Initialize crawler
crawler = WebCrawler()

# Crawl page
result = crawler.run(url="https://example.com")

# Extract content
print(result.markdown)      # Markdown content
print(result.cleaned_html)  # Cleaned HTML
print(result.media)         # Images, videos
print(result.links)         # All links

# Extract structured data
result = crawler.run(
    url="https://example.com/products",
    extraction_strategy="css",
    css_selector=".product-item"
)

for item in result.extracted_content:
    print(item)
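Note that recent crawl4ai releases are organized around an async interface, AsyncWebCrawler. A minimal sketch, assuming a current version of the library:

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # AsyncWebCrawler is the primary interface in current crawl4ai releases
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)  # Markdown rendering of the page

asyncio.run(main())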
Advanced Usage
import json
from crawl4ai import WebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Define extraction schema
schema = {
    "name": "product",
    "baseSelector": ".product-item",
    "fields": [
        {
            "name": "title",
            "selector": ".product-name",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".product-price",
            "type": "text"
        },
        {
            "name": "image",
            "selector": "img",
            "type": "attribute",
            "attribute": "src"
        }
    ]
}

# Configure crawler
config = CrawlerRunConfig(
    extraction_strategy=JsonCssExtractionStrategy(schema),
    wait_for_selector=".product-item",
    screenshot=True,
    verbose=True
)

# Run crawler
crawler = WebCrawler()
result = crawler.run(url="https://example.com/products", config=config)

# Process results (extracted_content is returned as a JSON string)
products = json.loads(result.extracted_content)
for product in products:
    print(f"{product['title']}: {product['price']}")
Anti-Scraping Bypass
Rotate User Agents
import random
import requests

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]

headers = {
    'User-Agent': random.choice(USER_AGENTS)
}

response = requests.get(url, headers=headers)
Use Proxies
import random
import requests

proxies = {
    'http': 'http://proxy:port',
    'https': 'http://proxy:port'
}

response = requests.get(url, proxies=proxies)

# Rotating proxies
PROXY_LIST = ['proxy1:port', 'proxy2:port', 'proxy3:port']

for url in urls:
    proxy = random.choice(PROXY_LIST)
    proxies = {'http': f'http://{proxy}', 'https': f'http://{proxy}'}
    response = requests.get(url, proxies=proxies)
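Rotating proxies fail frequently, so it helps to retry a request across the pool. A sketch using the PROXY_LIST above:

import random
import requests

def get_with_proxy_retry(url, proxy_list, attempts=3, timeout=10):
    """Try a request through randomly chosen proxies, falling back on failure."""
    last_error = None
    for _ in range(attempts):
        proxy = random.choice(proxy_list)
        proxies = {'http': f'http://{proxy}', 'https': f'http://{proxy}'}
        try:
            response = requests.get(url, proxies=proxies, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            last_error = e  # dead proxy or bad response; try another one
    raise last_error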
Handle Rate Limiting
import time
import requests
from functools import wraps

def rate_limit(delay=1):
    """Decorator to add a delay before each request"""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            time.sleep(delay)
            return func(*args, **kwargs)
        return wrapper
    return decorator

@rate_limit(delay=2)
def fetch_page(url):
    return requests.get(url)
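A fixed delay does not help once the server starts answering with HTTP 429. A sketch that retries with exponential backoff and honors a Retry-After header when one is sent:

import time
import requests

def fetch_with_backoff(url, max_retries=5, base_delay=1):
    """Retry on 429/5xx responses with exponential backoff."""
    for attempt in range(max_retries):
        response = requests.get(url, timeout=10)
        if response.status_code not in (429, 500, 502, 503, 504):
            return response

        # Prefer the server's Retry-After hint, otherwise back off exponentially
        retry_after = response.headers.get('Retry-After')
        delay = int(retry_after) if retry_after and retry_after.isdigit() else base_delay * 2 ** attempt
        time.sleep(delay)

    return response  # give back the last response after exhausting retries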
Handle CAPTCHAs
# Use 2Captcha or a similar solving service (pip install 2captcha-python)
from twocaptcha import TwoCaptcha

solver = TwoCaptcha('YOUR_API_KEY')

try:
    result = solver.recaptcha(
        sitekey='6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-',
        url='https://example.com'
    )
    print(f"Solved: {result['code']}")
except Exception as e:
    print(f"Error: {e}")
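The solved token still has to reach the target site; a common pattern is to inject it into the hidden g-recaptcha-response field before submitting the form. A sketch with Selenium (the form selector is a placeholder, and some sites validate the token in other ways):

from selenium.webdriver.common.by import By

def submit_with_token(driver, token):
    """Inject a solved reCAPTCHA token into the page, then submit the form."""
    driver.execute_script(
        "document.getElementById('g-recaptcha-response').innerHTML = arguments[0];",
        token
    )
    driver.find_element(By.CSS_SELECTOR, 'form').submit()  # placeholder form selector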
Best Practices
# ✅ Respect robots.txt
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

if rp.can_fetch("*", "https://example.com/page"):
    # Scrape page
    pass

# ✅ Add random delays between requests
import random
import time
time.sleep(random.uniform(1, 3))

# ✅ Use a session for efficiency (connection reuse, shared headers/cookies)
import requests
session = requests.Session()
response = session.get(url)

# ✅ Handle errors gracefully
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")

# ✅ Save progress
import json

def save_checkpoint(data, filename='checkpoint.json'):
    with open(filename, 'w') as f:
        json.dump(data, f)

def load_checkpoint(filename='checkpoint.json'):
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return []
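Putting the checkpoint helpers to work, a sketch of a resumable scraping loop that skips URLs finished in a previous run (scrape_page is a hypothetical per-page function; time and random as imported above):

def scrape_with_checkpoints(urls):
    """Resume from checkpoint.json and save progress after every page."""
    results = load_checkpoint()
    done = {item['url'] for item in results}

    for url in urls:
        if url in done:
            continue  # already scraped in a previous run

        data = scrape_page(url)      # hypothetical per-page scraping function
        data['url'] = url
        results.append(data)
        save_checkpoint(results)     # persist progress so a crash loses at most one page

        time.sleep(random.uniform(1, 3))

    return results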