Anthropic Claude · ソフトウェア開発 · ⭐ リポ 0 · 品質スコア 50/100

beautifulsoup-parsing

Name: beautifulsoup-parsing
Author: mindrally

BeautifulSoupを使ったHTML/XMLの解析に関する専門的なガイダンスを提供し、DOMナビゲーション、データ抽出、効率的なスクレイピングワークフローのベストプラクティスをサポートします。

description の原文を見る

Expert guidance for HTML/XML parsing using BeautifulSoup in Python with best practices for DOM navigation, data extraction, and efficient scraping workflows.

SKILL.md 本文

BeautifulSoup HTML パース

BeautifulSoup、Python の HTML/XML パース、DOM ナビゲーション、Web スクレイピングのための効率的なデータ抽出パイプライン構築の専門家です。

中心的な専門知識

BeautifulSoup API とパース方法
CSS セレクタと find メソッド
DOM トラバーサルとナビゲーション
異なるパーサーを使用した HTML/XML パース
requests ライブラリとの統合
不正な形式の HTML の適切な処理
データ抽出パターンとベストプラクティス
メモリ効率的な処理

主な原則

簡潔で技術的なコードを正確な Python の例とともに作成
可読性、効率性、保守性を優先
一般的な抽出タスク用にモジュール化された再利用可能な関数を使用
欠落データは適切なデフォルト値で処理
PEP 8 スタイルガイドに従う
ロバストなスクレイピングのための適切なエラーハンドリングを実装

基本セットアップ

pip install beautifulsoup4 requests lxml

HTML の読み込み

from bs4 import BeautifulSoup
import requests

# 文字列から
html = '<html><body><h1>Hello</h1></body></html>'
soup = BeautifulSoup(html, 'lxml')

# ファイルから
with open('page.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'lxml')

# URL から
response = requests.get('https://example.com')
soup = BeautifulSoup(response.content, 'lxml')

パーサーオプション

# lxml - 高速、寛容 (推奨)
soup = BeautifulSoup(html, 'lxml')

# html.parser - 組み込み、依存関係なし
soup = BeautifulSoup(html, 'html.parser')

# html5lib - 最も寛容、最も遅い
soup = BeautifulSoup(html, 'html5lib')

# lxml-xml - XML ドキュメント用
soup = BeautifulSoup(xml, 'lxml-xml')

要素の検索

タグで検索

# 最初にマッチした要素
soup.find('h1')

# すべてのマッチした要素
soup.find_all('p')

# 短縮形
soup.h1  # soup.find('h1') と同じ

属性で検索

# クラスで検索
soup.find('div', class_='article')
soup.find_all('div', class_='article')

# ID で検索
soup.find(id='main-content')

# 任意の属性で検索
soup.find('a', href='https://example.com')
soup.find_all('input', attrs={'type': 'text', 'name': 'email'})

# データ属性で検索
soup.find('div', attrs={'data-id': '123'})

CSS セレクタ

# 単一の要素
soup.select_one('div.article > h2')

# 複数の要素
soup.select('div.article h2')

# 複雑なセレクタ
soup.select('a[href^="https://"]')  # 〜で始まる
soup.select('a[href$=".pdf"]')      # 〜で終わる
soup.select('a[href*="example"]')   # 〜を含む
soup.select('li:nth-child(2)')
soup.select('h1, h2, h3')           # 複数

関数で検索

import re

# 正規表現で検索
soup.find_all('a', href=re.compile(r'^https://'))

# 関数で検索
def has_data_attr(tag):
    """Report whether *tag* carries a ``data-id`` attribute."""
    present = tag.has_attr('data-id')
    return present

soup.find_all(has_data_attr)

# 文字列マッチング
soup.find_all(string='exact text')
soup.find_all(string=re.compile('pattern'))

データの抽出

テキストコンテンツ

# テキストを取得
element.text
element.get_text()

# セパレータ付きでテキストを取得
element.get_text(separator=' ')

# トリミングされたテキストを取得
element.get_text(strip=True)

# 文字列を取得 (ジェネレータ)
for string in element.stripped_strings:
    print(string)

属性

# 属性を取得
element['href']
element.get('href')  # 欠落している場合は None を返す
element.get('href', 'default')  # デフォルト値付き

# すべての属性を取得
element.attrs  # dict を返す

# 属性の存在を確認
element.has_attr('class')

HTML コンテンツ

# 要素全体の HTML (自身のタグを含む)。内部 HTML のみは element.decode_contents()
str(element)

# タグ名
element.name

# 整形された HTML
element.prettify()

DOM ナビゲーション

親/祖先

element.parent
element.parents  # すべての祖先のジェネレータ

# 特定の祖先を検索
for parent in element.parents:
    if parent.name == 'div' and 'article' in parent.get('class', []):
        break

子要素

element.children      # 直下の子要素 (ジェネレータ)
list(element.children)

element.contents      # 直下の子要素 (リスト)
element.descendants   # すべての子孫 (ジェネレータ)

# 子要素内で検索
element.find('span')  # 子孫を検索

兄弟要素

element.next_sibling
element.previous_sibling

element.next_siblings      # ジェネレータ
element.previous_siblings  # ジェネレータ

# パース順で次/前のノード (空白のみのテキストノードも含む)
element.next_element
element.previous_element

データ抽出パターン

安全な抽出

def safe_text(element, selector, default=''):
    """Return the stripped text of the first match for *selector*.

    Falls back to *default* when nothing matches.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get_text(strip=True)

def safe_attr(element, selector, attr, default=None):
    """Return attribute *attr* of the first match for *selector*.

    Falls back to *default* when nothing matches or the attribute is absent.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get(attr, default)

テーブルの抽出

def extract_table(table):
    """Extract table data as a list of dictionaries.

    Header labels are the stripped text of every ``th`` cell in the table.
    Each data row is zipped against them, so a row with more cells than
    headers is truncated and a row with fewer cells yields fewer keys.

    Args:
        table: Element exposing ``select()`` (e.g. a ``<table>`` tag).

    Returns:
        One ``{header: cell_text}`` dict per row that contains ``td`` cells.
    """
    headers = [th.get_text(strip=True) for th in table.select('th')]

    # Only html5lib inserts an implied <tbody>; lxml and html.parser keep the
    # markup as written, so fall back to every <tr> when none is under tbody.
    body_rows = table.select('tbody tr') or table.select('tr')

    rows = []
    for tr in body_rows:
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        if cells:  # skip header-only and empty rows
            rows.append(dict(zip(headers, cells)))

    return rows

リストの抽出

def extract_items(soup, selector, extractor):
    """Apply *extractor* to each element matching *selector*; return a list."""
    results = []
    for node in soup.select(selector):
        results.append(extractor(node))
    return results

# 使用例
def extract_product(item):
    """Build a product record (name, price, url) from one product element."""
    name = safe_text(item, '.name')
    price = safe_text(item, '.price')
    url = safe_attr(item, 'a', 'href')
    return {'name': name, 'price': price, 'url': url}

products = extract_items(soup, '.product', extract_product)

URL の解決

from urllib.parse import urljoin

def resolve_url(base_url, relative_url):
    """Resolve *relative_url* against *base_url*.

    Returns None when *relative_url* is empty or missing.
    """
    return urljoin(base_url, relative_url) if relative_url else None

# 使用例
base_url = 'https://example.com/products/'
for link in soup.select('a'):
    href = link.get('href')
    absolute_url = resolve_url(base_url, href)
    print(absolute_url)

不正な形式の HTML の処理

# lxml パーサーは不正な形式の HTML に対して寛容
soup = BeautifulSoup(malformed_html, 'lxml')

# 非常に壊れた HTML の場合は html5lib を使用
soup = BeautifulSoup(very_broken_html, 'html5lib')

# エンコードの問題に対応
response = requests.get(url)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'lxml')

完全なスクレイピング例

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

class ProductScraper:
    """Scrape product cards from pages of a single site.

    The scraper owns a ``requests.Session``. Call :meth:`close`, or use the
    instance as a context manager, to release the session when finished.
    """

    def __init__(self, base_url):
        """Create the HTTP session and set a custom User-Agent header.

        Args:
            base_url: URL against which relative links are resolved.
        """
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)'
        })

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session on exit; exceptions are not suppressed."""
        self.close()

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()

    def fetch_page(self, url):
        """Fetch *url* and parse the response body with the lxml parser.

        Raises whatever ``raise_for_status()`` raises for an error status.
        """
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'lxml')

    def extract_product(self, item):
        """Extract product data from a single card element.

        Returns:
            A dict with the keys ``name``, ``price``, ``rating``, ``image``,
            ``url`` and ``in_stock``. Fields whose element is missing get
            ``''`` (name) or ``None``.
        """
        return {
            'name': self._safe_text(item, '.product-title'),
            'price': self._parse_price(item.select_one('.price')),
            'rating': self._safe_attr(item, '.rating', 'data-rating'),
            'image': self._resolve(self._safe_attr(item, 'img', 'src')),
            'url': self._resolve(self._safe_attr(item, 'a', 'href')),
            # In stock unless the card contains an ".out-of-stock" marker.
            'in_stock': item.select_one('.out-of-stock') is None
        }

    def scrape_products(self, url):
        """Return the data of every ``.product-card`` element on the page."""
        soup = self.fetch_page(url)
        items = soup.select('.product-card')
        return [self.extract_product(item) for item in items]

    def _safe_text(self, element, selector, default=''):
        """Return stripped text of the first match, or *default* if none."""
        found = element.select_one(selector)
        if found is None:
            return default
        return found.get_text(strip=True)

    def _safe_attr(self, element, selector, attr, default=None):
        """Return *attr* of the first match, or *default* if unavailable."""
        found = element.select_one(selector)
        if found is None:
            return default
        return found.get(attr, default)

    def _parse_price(self, element):
        """Convert a price element's text such as ``$1,299.99`` to a float.

        Returns None when the element is missing or when the text, after
        removing ``$`` and ``,``, is not a finite number.
        """
        if element is None:
            return None
        text = element.get_text(strip=True)
        try:
            value = float(text.replace('$', '').replace(',', ''))
        except ValueError:
            return None
        # float() also accepts "nan", "inf" and "infinity"; a page that
        # renders "$NaN" must not produce a numeric price.
        if value != value or value in (float('inf'), float('-inf')):
            return None
        return value

    def _resolve(self, url):
        """Make *url* absolute against ``base_url``; None stays None."""
        return urljoin(self.base_url, url) if url else None


# 使用例
scraper = ProductScraper('https://example.com')
products = scraper.scrape_products('https://example.com/products')
for product in products:
    print(product)

パフォーマンス最適化

# SoupStrainer を使用して必要な要素のみをパース
from bs4 import SoupStrainer

only_articles = SoupStrainer('article')
soup = BeautifulSoup(html, 'lxml', parse_only=only_articles)

# 速度のために lxml パーサーを使用
soup = BeautifulSoup(html, 'lxml')  # 最速

# 不要な要素を分解
for script in soup.find_all('script'):
    script.decompose()

# Stream results with a generator for memory efficiency
def iter_items(soup):
    """Yield the extracted data of each ``.item`` element, one at a time."""
    # `yield` is only valid inside a function, so the loop lives in a generator.
    for item in soup.select('.item'):
        yield extract_data(item)

主な依存関係

beautifulsoup4
lxml (高速パーサー)
html5lib (寛容なパーサー)
requests
pandas (データ出力用)

ベストプラクティス

最高のパフォーマンスを得るため、常に lxml パーサーを使用
デフォルト値を使用して欠落要素に対応
CSS セレクタには select() と select_one() を使用
クリーンなテキスト抽出には get_text(strip=True) を使用
相対 URL を絶対 URL に解決
抽出されたデータ型を検証
リクエスト間でレート制限を実装
適切な User-Agent ヘッダーを使用
文字エンコーディングを適切に処理
大規模ドキュメント用に SoupStrainer を使用
robots.txt とウェブサイトの利用規約に従う
失敗したリクエスト用にリトライロジックを実装

ライセンス: Apache-2.0(寛容ライセンスのため全文を引用しています) · 原本リポジトリ

詳細情報

作者: mindrally
リポジトリ: mindrally/skills
ライセンス: Apache-2.0
最終更新: 不明

GitHubで原本を見る → · フィードバックを送る

Source: https://github.com/mindrally/skills / ライセンス: Apache-2.0