2ch をスクレイピングする
スクレイピング対象ページのサンプル
http://ai.2ch.sc/test/read.cgi/newsplus/1613982935/
コードのサンプル
from bs4 import BeautifulSoup
import requests
import json
import sys
import re
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo
from datetime import datetime
class Scraping():
    """Fetch a thread page and extract selected posts as HTML."""

    # Seconds to wait for the thread page; without a timeout requests.get
    # can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    # Leading run of digits at the start of a "<dt>" chunk (the post number).
    _POST_NUMBER = re.compile(r'^[0-9]*')

    @classmethod
    def get_comments(cls, target_url, target_indexes):
        """Return the requested posts of a thread as one HTML string.

        Args:
            target_url: URL of the thread page to download.
            target_indexes: Iterable of post numbers (ints) to keep.

        Returns:
            The matching posts, each wrapped in a styled ``<div>``, joined in
            page order. An empty string when the page has no ``<dl>`` element
            or none of the requested post numbers is present.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        # NOTE(review): presumably raised because serialising the parsed tree
        # with str() recurses deeply on this markup - confirm before removing.
        sys.setrecursionlimit(20000)
        response = requests.get(target_url, timeout=cls.REQUEST_TIMEOUT)
        # Fail loudly rather than scraping (and later posting) an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # The posts are taken from the first <dl> element of the page.
        thread = soup.find("dl")
        if thread is None:
            return ''
        return cls._select_posts(str(thread), target_indexes)

    @classmethod
    def _select_posts(cls, thread_html, target_indexes):
        """Pick the posts numbered in *target_indexes* out of the <dl> markup."""
        wanted = set(target_indexes)
        # Drop closing tags so that each "<dt>" chunk holds one whole post.
        for closing_tag in ("</dl>", "</dd>", "</dt>"):
            thread_html = thread_html.replace(closing_tag, "")
        selected = []
        for chunk in thread_html.split("<dt>"):
            # Match on the post number embedded in the HTML rather than on the
            # chunk position, so the selection cannot drift out of step.
            number = cls._POST_NUMBER.search(chunk).group()
            if number and int(number) in wanted:
                selected.append(
                    f'<div style="padding: 10px; margin-bottom: 10px; border: 1px solid #333333; border-radius: 10px;">{chunk}</div>'
                )
        return ''.join(selected)
class File:
    """Read the scraper's settings from files in the current directory."""

    @classmethod
    def get_target_url(cls):
        """Return the thread URL stored in ``./target_url.txt``.

        When the file has several lines the last non-blank one wins.
        Surrounding whitespace, including the trailing newline, is removed.

        Raises:
            ValueError: If the file contains no non-blank line.
        """
        url = None
        with open('./target_url.txt', 'r') as lines:
            for line in lines:
                candidate = line.strip()
                if candidate:
                    url = candidate
        if url is None:
            raise ValueError('./target_url.txt does not contain a URL')
        return url

    @classmethod
    def get_target_indexes(cls):
        """Return the post numbers listed in ``./target_indexes.txt``.

        The file holds one integer per line; blank lines are skipped.

        Raises:
            ValueError: If a non-blank line is not a valid integer.
        """
        with open('./target_indexes.txt', 'r') as lines:
            entries = (line.strip() for line in lines)
            return [int(entry) for entry in entries if entry]

    @classmethod
    def get_wordpress_info(cls):
        """Return the parsed contents of ``./wordpress_info.json``."""
        with open('./wordpress_info.json', 'r') as handle:
            return json.load(handle)
class WordPress:
    """Publish scraped content to a WordPress site over XML-RPC."""

    @classmethod
    def post_comments(cls, wordpress_info, comments, title="test",
                      categories=("test_category",)):
        """Create and immediately publish a post containing *comments*.

        Args:
            wordpress_info: Mapping with the keys ``'url'``, ``'user'`` and
                ``'password'`` for the target site.
            comments: HTML string used as the post body.
            title: Title of the new post.
            categories: Category names to attach to the post.

        Returns:
            The result of the ``NewPost`` call; per the library documentation
            this is the ID of the newly created post.
        """
        # Tolerate a trailing slash in the configured site URL so the
        # endpoint never becomes ".../​/xmlrpc.php".
        endpoint = wordpress_info['url'].rstrip('/') + '/xmlrpc.php'
        wp = Client(endpoint, wordpress_info['user'], wordpress_info['password'])

        post = WordPressPost()
        post.title = title
        post.content = comments
        post.terms_names = {'category': list(categories)}
        post.post_status = 'publish'
        return wp.call(NewPost(post))
def main():
    """Scrape the configured posts and publish them as one WordPress post."""
    # Arguments are evaluated left to right: URL, then post numbers, then the
    # download itself; the WordPress settings are only read afterwards.
    scraped_html = Scraping.get_comments(
        File.get_target_url(),
        File.get_target_indexes(),
    )
    WordPress.post_comments(File.get_wordpress_info(), scraped_html)


# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
ディスカッション
コメント一覧
まだ、コメントがありません