Wikipediaで主要な文章だけを読みたい場合、HTMLタグを指定してテキストを取得できます。
実行環境作成、ライブラリのインポートは、ご自身でする必要があります。
【コード】
import tkinter as tk
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup
# Default target URL: the Japanese Wikipedia article on "書道" (calligraphy),
# percent-encoded because the page title contains non-ASCII characters.
default_url = 'https://ja.wikipedia.org/wiki/%E6%9B%B8%E9%81%93'
def analyze_tags(url):
    """Download *url* and return the set of distinct HTML tag names it contains.

    Args:
        url: Page URL to fetch.

    Returns:
        set[str]: Unique tag names found in the parsed document.

    Raises:
        requests.HTTPError: If the server responds with an error status code.
        requests.RequestException: On network failure or timeout.
    """
    # A timeout keeps the (single-threaded) tkinter GUI from hanging
    # indefinitely when the host is unreachable.
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return {tag.name for tag in soup.find_all()}
def update_tags():
    """Rebuild the tag-checkbox grid for the URL currently in the entry box.

    Reads the module-level ``url_entry``, clears ``tag_frame`` and
    ``tag_vars``, then adds one checkbox per alphanumeric tag name found
    on the page, laid out five per row.
    """
    url = url_entry.get()
    tags = analyze_tags(url)
    # Remove the previous grid of checkboxes and their backing variables.
    for widget in tag_frame.winfo_children():
        widget.destroy()
    tag_vars.clear()
    # Count only the checkboxes actually placed: enumerating the raw tag
    # list would leave holes in the grid wherever a non-alphanumeric tag
    # name was filtered out.
    placed = 0
    for tag in sorted(tags):
        if tag and tag.isalnum():
            row, column = divmod(placed, 5)
            placed += 1
            var = tk.BooleanVar()
            tag_vars[tag] = var
            check = tk.Checkbutton(tag_frame, text=tag, variable=var)
            check.grid(row=row, column=column, sticky="w")
def select_all_tags():
    """Tick every tag checkbox currently shown in the tag grid."""
    checkbox_states = list(tag_vars.values())
    for checkbox_state in checkbox_states:
        checkbox_state.set(True)
def scrape_website():
    """Scrape the current URL with Selenium and display text for every selected tag.

    Reads the module-level ``tag_vars``, ``url_entry`` and ``result_text``.
    Launches a Chrome instance, collects all elements matching the selected
    tag names, and writes tag/text pairs into the result box.
    """
    selected_tags = [tag for tag, var in tag_vars.items() if var.get()]
    if not selected_tags:
        # An empty XPath union ('') would make find_elements raise
        # InvalidSelectorException — report and bail out early instead.
        result_text.delete('1.0', tk.END)
        result_text.insert(tk.END, "タグが選択されていません。")
        return
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url_entry.get())
        # Crude fixed wait for dynamic content to load; a WebDriverWait on a
        # concrete condition would be more robust.
        time.sleep(5)
        # Union query, e.g. "//p|//h2", matches every selected tag at once.
        xpath_query = '|'.join(f"//{tag}" for tag in selected_tags)
        elements = driver.find_elements(By.XPATH, xpath_query)
        output = [
            f"タグ: <{element.tag_name}>\nテキスト: {element.text}\n{'-'*40}\n"
            for element in elements
        ]
    finally:
        # Always release the browser process, even if navigation or the
        # query fails — otherwise each failed run leaks a Chrome instance.
        driver.quit()
    result_text.delete('1.0', tk.END)
    result_text.insert(tk.END, "\n".join(output))
# --- Main window -----------------------------------------------------------
root = tk.Tk()
root.title("ウェブスクレイピングツール")
# Size the window to roughly the left half of the screen, inset by margins,
# and place it near the top-left corner.
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
window_width = screen_width // 2 - 100
window_height = screen_height - 100
window_x = 50
window_y = 50
root.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
# --- URL input row ---------------------------------------------------------
url_label = tk.Label(root, text="URL:")
url_label.pack()
url_entry = tk.Entry(root, width=50)
url_entry.pack()
# Pre-fill with the default Wikipedia article so the tool works out of the box.
url_entry.insert(0, default_url)
# --- Action buttons (analyze tags / select all / run scrape) ---------------
button_frame = tk.Frame(root)
button_frame.pack()
update_button = tk.Button(button_frame, text="タグ更新", command=update_tags)
update_button.pack(side=tk.LEFT)
select_all_button = tk.Button(button_frame, text="すべて選択", command=select_all_tags)
select_all_button.pack(side=tk.LEFT)
scrape_button = tk.Button(button_frame, text="スクレイピング実行", command=scrape_website)
scrape_button.pack(side=tk.LEFT)
# Container that update_tags() fills with one Checkbutton per discovered tag.
tag_frame = tk.Frame(root)
tag_frame.pack()
# Maps tag name -> tk.BooleanVar backing its checkbox; shared by the
# callbacks above as module-level state.
tag_vars = {}
# --- Scrollable result area ------------------------------------------------
result_frame = tk.Frame(root)
result_frame.pack(fill="both", expand=True)
result_scrollbar = tk.Scrollbar(result_frame, orient="vertical")
result_text = tk.Text(result_frame, yscrollcommand=result_scrollbar.set, wrap="word", spacing3=10)
# Wire the scrollbar and text widget to each other in both directions.
result_scrollbar.config(command=result_text.yview)
result_scrollbar.pack(side="right", fill="y", expand=False)
result_text.pack(side="left", fill="both", expand=True)
root.mainloop()