2ch をスクレイピングする

2021年7月17日 / 2021年7月28日
スクレイピング対象ページのサンプル

http://ai.2ch.sc/test/read.cgi/newsplus/1613982935/
コードのサンプル


from bs4 import BeautifulSoup
import requests
import json
import sys
import re

from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo
from datetime import datetime

class Scraping():
    """Scrape selected posts out of a thread page and wrap them in HTML."""

    # Leading run of ASCII digits at the start of a chunk, i.e. the post number.
    # The pattern always matches (possibly the empty string), so .group() is safe.
    _POST_NUMBER = re.compile('^[0-9]*')

    @classmethod
    def get_comments(cls, target_url, target_indexes, timeout=30):
        """Fetch a thread page and return the requested posts as one HTML string.

        The first ``<dl>`` element of the page is serialised, its closing
        ``</dl>``/``</dd>``/``</dt>`` tags are dropped, and the remainder is
        split on ``<dt>``. Every chunk that starts with a post number contained
        in ``target_indexes`` is wrapped in a styled ``<div>``.

        Args:
            target_url: URL of the thread page to download.
            target_indexes: iterable of post numbers (ints) to keep.
            timeout: seconds passed to ``requests.get`` so that a stalled
                server cannot block the script indefinitely.

        Returns:
            The concatenated ``<div>`` blocks in page order, or ``''`` when
            the page has no ``<dl>`` or none of the requested posts.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                times out.
        """
        # NOTE(review): interpreter-wide side effect kept from the original;
        # presumably needed because parsing/serialising deeply nested markup
        # recurses heavily -- confirm whether it is still required.
        sys.setrecursionlimit(20000)

        response = requests.get(target_url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        thread = soup.find("dl")
        if thread is None:
            # No post list on the page: nothing to extract.
            return ''

        # Work on the serialised markup and drop the closing tags so each
        # "<dt>"-separated chunk holds one post header plus its body.
        markup = str(thread)
        for closing_tag in ("</dl>", "</dd>", "</dt>"):
            markup = markup.replace(closing_tag, "")

        # Posts are matched by the number printed in the HTML rather than by
        # position, so gaps in the thread cannot shift the selection.
        wanted = set(target_indexes)
        selected = []
        for chunk in markup.split("<dt>"):
            number = cls._POST_NUMBER.search(chunk).group()
            if number and int(number) in wanted:
                selected.append(
                    f'<div style="padding: 10px; margin-bottom: 10px; border: 1px solid #333333; border-radius: 10px;">{chunk}</div>'
                )
        return ''.join(selected)

class File:
    """Read the script's inputs from files in the current working directory."""

    @classmethod
    def get_target_url(cls):
        """Return the thread URL stored in ``./target_url.txt``.

        The last non-blank line of the file is used, with surrounding
        whitespace (including the trailing newline) removed so it can be
        passed straight to an HTTP client.

        Raises:
            ValueError: if the file contains no non-blank line.
        """
        url = None
        with open('./target_url.txt', 'r') as lines:
            for line in lines:
                candidate = line.strip()
                if candidate:
                    url = candidate
        if url is None:
            raise ValueError('./target_url.txt does not contain a URL')
        return url

    @classmethod
    def get_target_indexes(cls):
        """Return the post numbers listed one per line in ``./target_indexes.txt``.

        Blank lines are skipped; ``int()`` itself tolerates surrounding
        whitespace on the remaining lines.

        Raises:
            ValueError: if a non-blank line is not an integer.
        """
        with open('./target_indexes.txt', 'r') as lines:
            return [int(line) for line in lines if line.strip()]

    @classmethod
    def get_wordpress_info(cls):
        """Return the parsed contents of ``./wordpress_info.json``."""
        with open('./wordpress_info.json', 'r') as handle:
            return json.load(handle)

class WordPress:
    """Publish content to a WordPress site through its XML-RPC endpoint."""

    @classmethod
    def post_comments(cls, wordpress_info, comments, title="test", categories=("test_category",)):
        """Create and immediately publish a post whose body is ``comments``.

        Args:
            wordpress_info: mapping with the keys ``'url'`` (site base URL),
                ``'user'`` and ``'password'``.
            comments: HTML string used as the post content.
            title: post title; defaults to the original hard-coded ``"test"``.
            categories: iterable of category names; defaults to the original
                hard-coded ``"test_category"``.

        Returns:
            Whatever the ``NewPost`` call returns (the new post's ID according
            to the python-wordpress-xmlrpc documentation).
        """
        # Tolerate a base URL written with a trailing slash so the endpoint
        # never becomes ".../​/xmlrpc.php".
        endpoint = wordpress_info['url'].rstrip('/') + '/xmlrpc.php'
        wp = Client(endpoint, wordpress_info['user'], wordpress_info['password'])

        post = WordPressPost()
        post.title = title
        post.content = comments
        post.terms_names = {'category': list(categories)}
        post.post_status = 'publish'
        return wp.call(NewPost(post))

def main():
    """Run the pipeline: read the inputs, scrape the posts, publish them."""
    url = File.get_target_url()
    wanted_posts = File.get_target_indexes()

    # Scrape first; the WordPress credentials are only read afterwards.
    html = Scraping.get_comments(url, wanted_posts)
    credentials = File.get_wordpress_info()

    WordPress.post_comments(credentials, html)

# Run the scrape-and-post pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()