hatokamome

hatokamomeの趣味・雑記録

Wikipediaのタグごとのテキスト取得ツール（Python）

Wikipediaで主要な文章だけを読みたい場合に、HTMLタグを指定してテキストを取得できるツールです。実行環境の作成と、必要なライブラリ（selenium、webdriver-manager、requests、beautifulsoup4）のインストールは、ご自身で行う必要があります。

【コード】

import tkinter as tk
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

# Default URL: the Japanese Wikipedia article whose percent-encoded title
# decodes to "書道".
default_url = 'https://ja.wikipedia.org/wiki/%E6%9B%B8%E9%81%93'

def analyze_tags(url, timeout=10):
    """Return the set of distinct HTML tag names found on the page at ``url``.

    The page is fetched with a plain HTTP GET (no JavaScript is executed) and
    parsed with BeautifulSoup's built-in ``html.parser``.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of tag-name strings, one entry per distinct tag in the document.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout an unresponsive host would block the caller forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return {tag.name for tag in soup.find_all()}

def update_tags():
    """Rebuild the tag check-box grid from the page named in the URL field.

    Reads the URL from ``url_entry``, collects the page's tag names via
    ``analyze_tags``, removes every existing child widget of ``tag_frame``,
    and creates one check box per valid tag, laid out five per row in
    alphabetical order. ``tag_vars`` is reset to map each displayed tag name
    to the ``BooleanVar`` backing its check box.
    """
    url = url_entry.get()
    # Fetch first: if this raises, the previous check boxes stay untouched.
    tags = analyze_tags(url)

    # Discard the check boxes and variables belonging to the previous page.
    for widget in tag_frame.winfo_children():
        widget.destroy()
    tag_vars.clear()

    # Filter BEFORE numbering so that rejected names do not consume a grid
    # index and leave empty cells in the layout.
    valid_tags = sorted(tag for tag in tags if tag and tag.isalnum())

    for i, tag in enumerate(valid_tags):
        row, column = divmod(i, 5)  # 5-column grid layout
        var = tk.BooleanVar()
        tag_vars[tag] = var
        check = tk.Checkbutton(tag_frame, text=tag, variable=var)
        check.grid(row=row, column=column, sticky="w")

def select_all_tags():
    """Set every variable stored in ``tag_vars`` to True (tick all boxes)."""
    for tag_name in tag_vars:
        tag_vars[tag_name].set(True)

def scrape_website():
    """Scrape the text of every element whose tag is ticked and display it.

    Opens the URL from ``url_entry`` in a Selenium-driven Chrome instance,
    waits for the page to settle, collects all elements matching any of the
    selected tag names, and replaces the contents of ``result_text`` with one
    "tag / text / separator" entry per element.

    If no tag is selected, a short notice is shown instead and no browser is
    started.
    """
    selected_tags = [tag for tag, var in tag_vars.items() if var.get()]

    # An empty selection would produce an empty XPath expression, which
    # Selenium rejects; report it up front rather than launching Chrome.
    if not selected_tags:
        result_text.delete('1.0', tk.END)
        result_text.insert(tk.END, "タグが選択されていません。")
        return

    # Union of "//tag" queries: matches every element of any selected tag.
    xpath_query = '|'.join(f"//{tag}" for tag in selected_tags)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url_entry.get())
        time.sleep(5)  # wait for the page's JavaScript to load

        elements = driver.find_elements(By.XPATH, xpath_query)
        # Element text must be read while the browser session is still alive.
        output = [f"タグ: <{element.tag_name}>\nテキスト: {element.text}\n{'-'*40}\n" for element in elements]
    finally:
        # Always shut the browser down, even if navigation or lookup failed,
        # so no orphaned Chrome/chromedriver process is left behind.
        driver.quit()

    result_text.delete('1.0', tk.END)
    result_text.insert(tk.END, "\n".join(output))

# UI setup: build the main window. Widgets are packed top-to-bottom in the
# order they are created below, so statement order determines the layout.
root = tk.Tk()
root.title("ウェブスクレイピングツール")

# Query the screen size and derive the window size and position from it:
# a bit under half the screen width, nearly full height, offset 50 px from
# the top-left corner.
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
window_width = screen_width // 2 - 100
window_height = screen_height - 100
window_x = 50
window_y = 50
root.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")

# URL input field
url_label = tk.Label(root, text="URL:")
url_label.pack()
url_entry = tk.Entry(root, width=50)
url_entry.pack()
url_entry.insert(0, default_url)  # pre-fill with the default URL

# Frame holding the three action buttons side by side
button_frame = tk.Frame(root)
button_frame.pack()

# "Update tags" button: re-reads the page and rebuilds the check boxes
update_button = tk.Button(button_frame, text="タグ更新", command=update_tags)
update_button.pack(side=tk.LEFT)

# "Select all" button: ticks every tag check box
select_all_button = tk.Button(button_frame, text="すべて選択", command=select_all_tags)
select_all_button.pack(side=tk.LEFT)

# "Run scraping" button: fetches the text of the selected tags
scrape_button = tk.Button(button_frame, text="スクレイピング実行", command=scrape_website)
scrape_button.pack(side=tk.LEFT)

# Frame for the tag-selection check boxes; populated by update_tags
tag_frame = tk.Frame(root)
tag_frame.pack()
tag_vars = {}  # tag name -> tk.BooleanVar of its check box (filled by update_tags)

# Result text box with a vertical scrollbar; the two widgets are linked in
# both directions so that scrolling either one updates the other.
result_frame = tk.Frame(root)
result_frame.pack(fill="both", expand=True)
result_scrollbar = tk.Scrollbar(result_frame, orient="vertical")
result_text = tk.Text(result_frame, yscrollcommand=result_scrollbar.set, wrap="word", spacing3=10)
result_scrollbar.config(command=result_text.yview)
result_scrollbar.pack(side="right", fill="y", expand=False)
result_text.pack(side="left", fill="both", expand=True)

# Main loop: hand control to Tk until the window is closed
root.mainloop()