4.1 八卦版鄉民從哪來?

這隻爬蟲會去爬當前八卦版前50篇文章, 然後看這些發文的鄉民是來自哪個國家:

import requests
import time
import json
import re
from bs4 import BeautifulSoup


PTT_URL = 'https://www.ptt.cc'
FREEGEOIP_API = 'http://freegeoip.net/json/'


def get_web_page(url):
    resp = requests.get(url=url, cookies={'over18': '1'})
    if resp.status_code != 200:
        print('Invalid url: ', resp.url)
        return None
    else:
        return resp.text


def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html5lib')
    # Retrieve the link of previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        # If post date matched:
        if d.find('div', 'date').text.strip() == date:
            # To retrieve the push count:
            push_count = 0
            push_str = d.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str)
                except ValueError:
                    # If transform failed, it might be '爆', 'X1', 'X2', etc.
                    if push_str == '爆':
                        push_count = 99
                    elif push_str.startswith('X'):
                        push_count = -10

            # To retrieve title and href of the article:
            if d.find('a'):
                href = d.find('a')['href']
                title = d.find('a').text
                author = d.find('div', 'author').text if d.find('div', 'author') else ''
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count,
                    'author': author
                })

    return articles, prev_url


def get_ip(dom):
    # e.g., ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 27.52.6.175
    pattern = '來自: \d+\.\d+\.\d+\.\d+'
    match = re.search(pattern, dom)
    if match:
        return match.group(0).replace('來自: ', '')
    else:
        return None


def get_country(ip):
    if ip:
        data = json.loads(requests.get(FREEGEOIP_API + ip).text)
        country_name = data['country_name'] if data['country_name'] else None
        return country_name
    return None


def main():
    print('取得今日文章列表:')
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []
        today = time.strftime('%m/%d').lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, today)
        print('共 %d 篇文章' % (len(articles)))

        print('取得前50篇文章的IP:')
        country_to_count = dict()
        for article in articles[:50]:
            print('查詢 IP:', article['title'])
            page = get_web_page(PTT_URL + article['href'])
            if page:
                ip = get_ip(page)
                country = get_country(ip)
                if country in country_to_count.keys():
                    country_to_count[country] += 1
                else:
                    country_to_count[country] = 1

        print('各國IP分佈: ')
        for k, v in country_to_count.items():
            print(k, v)


if __name__ == "__main__":
    main()

結果如下:

原始碼點我

關於國家的部分, 是把IP丟到freegeoip的api去查的, 詳細說明可以在官網查到: https://freegeoip.net

Last updated

Was this helpful?