# 3.1 PTT八卦版今日熱門文章

PTT web 版的 HTML 結構相當有規則, 所以很適合拿來練習爬蟲。下面這隻爬蟲的目的是找出今日的熱門文章 (50 推以上), 同時也會列出今天有哪些 ID 含有「5566」的作者發了文:

```python
import requests
import time
import json
from bs4 import BeautifulSoup


# Base URL of the PTT web site; site-relative paths (board index, paging hrefs)
# are appended to it to build full request URLs.
PTT_URL = 'https://www.ptt.cc'


def get_web_page(url, timeout=10):
    """Download a page and return its HTML text.

    The request is sent with the ``over18=1`` cookie set.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body as text, or ``None`` if the request failed or the
        server answered with a status code other than 200.
    """
    try:
        resp = requests.get(url=url, cookies={'over18': '1'}, timeout=timeout)
    except requests.RequestException as err:
        # Network-level failures are reported the same way as a bad status:
        # callers only need to check for None.
        print('Request failed:', url, err)
        return None

    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text


def _parse_push_count(push_str):
    """Convert the text of a row's push-count cell into an integer.

    Numeric text is returned as-is; '爆' is mapped to 99, any marker starting
    with 'X' (e.g. 'X1', 'X2') to -10, and empty or unknown text to 0.
    """
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        if push_str == '爆':
            return 99
        if push_str.startswith('X'):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles posted on ``date`` from one board index page.

    Args:
        dom: HTML text of the index page. ``None`` or an empty string (e.g. a
            failed download) yields an empty result instead of raising.
        date: Date string to match against each row's date cell after
            stripping whitespace.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts with
        the keys ``title``, ``href``, ``push_count`` and ``author``.
        ``prev_url`` is the href of the previous-page link, or ``None`` when
        the page has no usable previous-page link.
    """
    if not dom:
        return [], None

    soup = BeautifulSoup(dom, 'html5lib')

    # Retrieve the link of previous page: it is the second anchor in the
    # paging button group. Any missing piece results in prev_url = None.
    prev_url = None
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    if paging_div is not None:
        paging_links = paging_div.find_all('a')
        if len(paging_links) > 1:
            prev_url = paging_links[1].get('href')

    articles = []
    for entry in soup.find_all('div', 'r-ent'):
        # Only keep rows whose post date matches.
        date_div = entry.find('div', 'date')
        if date_div is None or date_div.text.strip() != date:
            continue

        # Rows without a link (presumably deleted posts) carry no title/href.
        link = entry.find('a')
        if link is None:
            continue

        nrec_div = entry.find('div', 'nrec')
        author_div = entry.find('div', 'author')
        articles.append({
            'title': link.text,
            'href': link['href'],
            'push_count': _parse_push_count(nrec_div.text if nrec_div is not None else ''),
            'author': author_div.text if author_div is not None else ''
        })

    return articles, prev_url


def get_author_ids(posts, pattern):
    """Return the set of author IDs that contain ``pattern`` as a substring.

    Args:
        posts: Iterable of dicts, each with an ``'author'`` key.
        pattern: Substring to look for inside each author ID.

    Returns:
        A set with the distinct matching author IDs.
    """
    authors = (entry['author'] for entry in posts)
    return {author for author in authors if pattern in author}


def main():
    """Crawl today's Gossiping posts, print a summary and dump them to JSON.

    Starting from the board's latest index page, pages are followed backwards
    through the previous-page link until a page yields no article dated today
    (or a page cannot be fetched). The function then prints the author IDs
    containing '5566', the number of posts, and every post whose push count
    reaches the threshold, and writes all collected posts to
    ``gossiping.json``.
    """
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html')
    if not current_page:
        return

    # To keep all of today's articles.
    articles = []
    # Today's date, here we remove the 0 at the head to match the format of PTT date.
    # API doc for strftime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
    # API doc for lstrip: https://www.tutorialspoint.com/python/string_lstrip.htm
    today = time.strftime("%m/%d").lstrip('0')
    current_articles, prev_url = get_articles(current_page, today)

    while current_articles:
        articles += current_articles
        if not prev_url:
            # No previous page to follow.
            break
        current_page = get_web_page(PTT_URL + prev_url)
        if not current_page:
            # Download failed: keep what has been collected so far instead of
            # passing None on to the parser.
            break
        current_articles, prev_url = get_articles(current_page, today)

    print("Today's 5566:")
    print(get_author_ids(articles, '5566'))

    print('\nThere are ', len(articles), ' posts today.')
    threshold = 50
    print('Hot post(≥ %d push): ' % threshold)
    for article in articles:
        # Inclusive comparison, matching the "≥" shown in the label above.
        if article['push_count'] >= threshold:
            print(article)
    # with as: https://openhome.cc/Gossip/Python/WithAs.html
    # json.dump: http://python3-cookbook.readthedocs.io/zh_CN/latest/c06/p02_read-write_json_data.html
    with open('gossiping.json', 'w', encoding='UTF-8') as file:
        json.dump(articles, file, indent=2, sort_keys=True, ensure_ascii=False)


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
```

輸出如下:

```
Today's 5566:
{'Kyle5566', 'umaru5566', 'LOLI5566', 'boring5566', 'ocean5566', 'rocky5566', 'thor5566', 'N5566', 'Shiau5566', 'Ommmmmm5566', 'XDDDD5566', 'eric1835566', 'glory5566', 'kasumi5566', 'oppo5566', 'sai5566', 'KMT5566', 'pusufu5566', 'GalLe5566', 'great5566', 'youtu5566', 'Mars5566', 'Laplace5566', 'CS5566', 'kenq5566', 'fu885566', 'jailkobe5566', 'bigloser5566', 'ncc5566', 'ceiba5566', 'Tiara5566', 'OAO5566', 'tigotigo5566', 'LBJisGod5566', 'yayaya5566', 'Davichi5566', 'Poke5566', 'Racing5566', 'stock5566', 'hddd5566', 'FL5566', 'aass5566'}

There are  1318  posts today.
Hot post(≥ 50 push): 
{'title': '[問卦] 不要再說甲甲是HIV高帶原族群了？', 'href': '/bbs/Gossiping/M.1495865102.A.6A4.html', 'push_count': 97, 'author': 'sa99'}
{'title': '[爆卦] 柯潔認輸了', 'href': '/bbs/Gossiping/M.1495865379.A.45B.html', 'push_count': 58, 'author': 'theHum'}
{'title': '[新聞] 中國遊客實拍\u3000「台灣和柬埔寨很像」', 'href': '/bbs/Gossiping/M.1495864115.A.F94.html', 'push_count': 99, 'author': 'stw0975'}
{'title': '[新聞] 指甲撞斷求償65萬 女攝影師「因我是職業的', 'href': '/bbs/Gossiping/M.1495863288.A.EE7.html', 'push_count': 65, 'author': 'WorkinChina'}
{'title': '[新聞] 日本街頭警匪追逐 網友傻眼：在慢跑嗎？', 'href': '/bbs/Gossiping/M.1495861344.A.311.html', 'push_count': 93, 'author': 'clark2644'}
{'title': 'Re: [新聞] 搶你飯碗？「白領外勞法案」下周審查\u3000林', 'href': '/bbs/Gossiping/M.1495859846.A.E3D.html', 'push_count': 99, 'author': 'SUPER22K'}
{'title': '[新聞] 揪甘心！資助倪敏然一家 康康一口氣開12', 'href': '/bbs/Gossiping/M.1495859235.A.90C.html', 'push_count': 55, 'author': 'Salcea'}
{'title': 'Re: [問卦] 林奕含事件超長懶人包', 'href': '/bbs/Gossiping/M.1495857192.A.2CD.html', 'push_count': 99, 'author': 'skysucker'}
{'title': '[新聞] 搶你飯碗？「白領外勞法案」下周審查\u3000林', 'href': '/bbs/Gossiping/M.1495856472.A.A7C.html', 'push_count': 84, 'author': 'caserchen'}
{'title': 'Re: [問卦] 取什麼id能約到炮啊？', 'href': '/bbs/Gossiping/M.1495855868.A.A89.html', 'push_count': 99, 'author': 'rrr518'}
{'title': '[問卦] 哪部漫畫完全不拖戲 也沒有爛尾??', 'href': '/bbs/Gossiping/M.1495856063.A.30D.html', 'push_count': 99, 'author': 'yahe0526'}
{'title': '[問卦] PTT到底哪一版才是風向最一致最祥和的???', 'href': '/bbs/Gossiping/M.1495854477.A.FC1.html', 'push_count': 80, 'author': 'bmw606042001'}
{'title': 'Re: [問卦] 偽物約到炮了!!!!!!!! rrrrrrrrrrrrr', 'href': '/bbs/Gossiping/M.1495853724.A.430.html', 'push_count': 99, 'author': 'rrr518'}
{'title': '[新聞] 北市榮總驚傳車輛墜樓\u3000疑倒車不慎釀禍', 'href': '/bbs/Gossiping/M.1495852318.A.B7F.html', 'push_count': 62, 'author': 'was7575'}
{'title': 'Re: [爆卦] 世大運臉書未經同意盜用廣告平面肖像權', 'href': '/bbs/Gossiping/M.1495851938.A.848.html', 'push_count': 81, 'author': 'pinkgogo'}
{'title': '[爆卦] 換個名稱，多切一刀，貴七元', 'href': '/bbs/Gossiping/M.1495851460.A.C4C.html', 'push_count': 63, 'author': 'amose999'}
{'title': '[新聞] 家屬赴美求援惹惱中國？綠黨政高層：李明', 'href': '/bbs/Gossiping/M.1495849744.A.310.html', 'push_count': 64, 'author': 'jailkobe5566'}
{'title': '[新聞] 民進黨偷渡低薪白領外勞法案？林淑芬籲', 'href': '/bbs/Gossiping/M.1495848480.A.EBA.html', 'push_count': 99, 'author': 'Siegfried921'}
{'title': 'Re: [爆卦] 2014年的 ooutputt(小杯) 就是林奕含本人', 'href': '/bbs/Gossiping/M.1495842416.A.AA3.html', 'push_count': 73, 'author': 'JCS15'}
{'title': '[新聞] 掃毒電影是真的！\u3000高雄破獲693公斤海洛', 'href': '/bbs/Gossiping/M.1495842136.A.2E0.html', 'push_count': 54, 'author': 'HANASUCIA'}
{'title': 'Re: [問卦] 死掉的小三是不是比較受同情？', 'href': '/bbs/Gossiping/M.1495836445.A.461.html', 'push_count': 51, 'author': 'takenostand'}
{'title': '[新聞] 被螃蟹誤當生蠔\u3000女沙灘裸曬陰部被夾傷', 'href': '/bbs/Gossiping/M.1495834367.A.62B.html', 'push_count': 99, 'author': 'YU0487'}
{'title': '[問卦] 貓咪怎麼晚不睡怎麼辦', 'href': '/bbs/Gossiping/M.1495826048.A.220.html', 'push_count': 99, 'author': 'pupss89177'}
{'title': '[問卦] 年薪1000萬但要做篩選噁心圖片工作願意嗎', 'href': '/bbs/Gossiping/M.1495823857.A.78C.html', 'push_count': 56, 'author': 'MrSatan'}
{'title': 'Re: [問卦] 甲甲是不是很容易彼此看對眼阿？', 'href': '/bbs/Gossiping/M.1495820604.A.013.html', 'push_count': 99, 'author': 'aynmeow'}
{'title': '[問卦] 低薪、高房價..到底還要多久啊？', 'href': '/bbs/Gossiping/M.1495820926.A.165.html', 'push_count': 82, 'author': 'saufu08'}
{'title': 'Re: [問卦] 無證據定罪陳星為何看大家似乎很爽?', 'href': '/bbs/Gossiping/M.1495821021.A.358.html', 'push_count': 52, 'author': 'taxuan'}
{'title': '[問卦] 林奕含事件超長懶人包', 'href': '/bbs/Gossiping/M.1495816752.A.03B.html', 'push_count': 99, 'author': 'PlusSign'}
{'title': '[新聞] 老婆跟人跑\u3000企鵝愛上動漫女主角', 'href': '/bbs/Gossiping/M.1495816352.A.563.html', 'push_count': 99, 'author': 'your135'}
{'title': 'Re: [問卦] 被遊俠海怎麼辦', 'href': '/bbs/Gossiping/M.1495816268.A.689.html', 'push_count': 99, 'author': 'Wtaa'}
{'title': '[新聞] 原民語言法三讀 總統：每一族語都是國家', 'href': '/bbs/Gossiping/M.1495814878.A.453.html', 'push_count': 54, 'author': 'GodGeass'}

Process finished with exit code 0
```

~~不得不說, 有些鄉民的發文真的是讓人看了就搖頭...~~

原始碼[點我](https://github.com/yotsuba1022/web-crawler-practice/blob/master/ch3/ptt_gossiping.py)


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://clu.gitbook.io/python-web-crawler-note/31-pttba-gua-ban-jin-ri-re-men-wen-zhang.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
