長野マラソンの順位がなかなかでないので、ランナーズアップデートをスクレイピングして、集計してみた
結果はこちら
ソース
# coding:utf-8
"""Scrape per-runner result pages of the 2017 Nagano Marathon into a CSV file.

For every bib number in ``range(MAX_BIB)`` the matching page is downloaded,
the runner's name, category, start time and goal time are extracted, and one
comma-separated row is printed and appended to ``OUTPUT_PATH``.  Bib numbers
whose page cannot be fetched or parsed are reported as ``<num> None None`` on
stdout and skipped.
"""
import lxml.html
import requests
import urllib.error
import urllib.request
import bs4
import chardet
import time

# Page URL; ``{}`` is replaced by the bib number.
URL_TEMPLATE = "http://update.runnet.jp/2017nagano/numberfile/{}.html"
# Bib numbers 0 .. MAX_BIB - 1 are probed.
MAX_BIB = 12000
OUTPUT_PATH = 'nagano.csv'
TIME_FORMAT = '%H:%M:%S'


def _fetch_runner(num):
    """Download and parse the result page for bib number *num*.

    Returns:
        A ``(category, name, start, goal)`` tuple of strings.

    Raises:
        OSError: the page could not be downloaded (includes
            ``urllib.error.URLError`` / ``HTTPError``).
        lxml.etree.ParserError: the document could not be parsed.
        IndexError: an expected element is missing from the page.
        ValueError: the start or goal cell is not an ``HH:MM:SS`` time.
    """
    url = URL_TEMPLATE.format(num)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read()

    # The page is parsed with BeautifulSoup first and re-serialised before
    # being handed to lxml, exactly as the original script did.
    soup = bs4.BeautifulSoup(html, "lxml")
    root = lxml.html.fromstring(str(soup))

    name = root.cssselect('#personalBlock > dl:nth-child(1) > dd')[0].text_content()
    category = root.cssselect('#personalBlock > dl:nth-child(3) > dd')[0].text_content()
    # Strip the ": " separator from the extracted cell text.
    name = name.replace(': ', '')
    category = category.replace(': ', '')

    # Second column of the table: the first cell is used as the start time and
    # the eleventh as the goal time.
    cells = root.cssselect('#mainBlock > table > tr > td:nth-child(2)')
    start = cells[0].text_content()
    goal = cells[10].text_content()

    # Validation only: strptime raises ValueError when a cell does not hold a
    # real time, so runners without both times are skipped by the caller.
    time.strptime(start, TIME_FORMAT)
    time.strptime(goal, TIME_FORMAT)

    return category, name, start, goal


def main():
    """Scrape every bib number and write the collected rows to OUTPUT_PATH."""
    # ``with`` guarantees the CSV is flushed and closed even if the run is
    # interrupted part-way through.
    with open(OUTPUT_PATH, 'w') as f:
        for num in range(MAX_BIB):
            try:
                category, name, start, goal = _fetch_runner(num)
                out = ','.join(map(str, (num, category, name, start, goal)))
                print(out)
                f.write(out + "\n")
            except (OSError, IndexError, ValueError, lxml.etree.ParserError):
                # Missing page, unexpected layout, no valid times, or a row
                # that cannot be encoded for output (UnicodeEncodeError is a
                # ValueError): report and move on.  KeyboardInterrupt is
                # deliberately not caught, so Ctrl-C still stops the run.
                print(num, None, None)


if __name__ == "__main__":
    main()