Python - How to automate this BeautifulSoup import
I am importing the links to the boxscores from this webpage:
http://www.covers.com/pageloader/pageloader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html This is how I am doing it now. I get the links from the first page.
# Results page (per its URL: 2012 past results for one team) that links to
# the individual boxscore pages.
url = 'http://www.covers.com/pageloader/pageloader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html'
boxurl = urllib2.urlopen(url).read()
soup = BeautifulSoup(boxurl)
# Keep only the anchors whose href contains "boxscore".
boxscores = soup.findAll('a', href=re.compile('boxscore'))
basepath = "http://www.covers.com"
pages = []
# This grabs the links from the page: download the raw HTML behind every
# boxscore link (hrefs are joined onto basepath).
for a in boxscores:
    pages.append(urllib2.urlopen(basepath + a['href']).read())

# Then, in a new window, I do this.
newsoup = pages[1]  # I am manually changing this every time
soup = BeautifulSoup(newsoup)


def _unpack(row, kind='td'):
    """Return the text of every ``kind`` element found inside ``row``."""
    return [val.text for val in row.findAll(kind)]


# NOTE(review): the table/row/cell indices below are taken as-is from the
# original script; they presumably mirror the boxscore page layout -- confirm
# against a live page.
tables = soup('table')
linescore = tables[1]
linescore_rows = linescore.findAll('tr')

# Unpack each team's row once instead of once per cell.
road_cells = _unpack(linescore_rows[1])
home_cells = _unpack(linescore_rows[2])

roadteamq1 = float(road_cells[1])
roadteamq2 = float(road_cells[2])
roadteamq3 = float(road_cells[3])
roadteamq4 = float(road_cells[4])
# add ot rows if ???
roadteamfinal = float(road_cells[-3])

hometeamq1 = float(home_cells[1])
hometeamq2 = float(home_cells[2])
hometeamq3 = float(home_cells[3])
hometeamq4 = float(home_cells[4])
# add ot rows if ???
hometeamfinal = float(home_cells[-3])

misc_stats = tables[5]
misc_stats_rows = misc_stats.findAll('tr')
team_cells = _unpack(misc_stats_rows[0])
roadteam = str(team_cells[0]).strip()
hometeam = str(team_cells[1]).strip()

datefinder = tables[6]
datefinder_rows = datefinder.findAll('tr')
date = str(_unpack(datefinder_rows[0])[0]).strip()

year = 2012
from dateutil.parser import parse
parseddate = parse(date)
# Force the hard-coded year onto the parsed date.
date = parseddate.replace(year)
month = parseddate.month
day = parseddate.day
# Game id is day + month + year (no separators) followed by both team names.
moddate = str(day)+str(month)+str(year)
gameid = moddate + roadteam + hometeam

data = {'roadteam': [roadteam], 'hometeam': [hometeam], 'roadq1': [roadteamq1], 'roadq2': [roadteamq2], 'roadq3': [roadteamq3], 'roadq4': [roadteamq4], 'homeq1': [hometeamq1], 'homeq2': [hometeamq2], 'homeq3': [hometeamq3], 'homeq4': [hometeamq4]}

# The one-row frame is stored under a module-level name equal to the game id.
globals()["%s" % gameid] = pd.DataFrame(data)
# NOTE(review): DataFrame.load / DataFrame.save only exist in old pandas
# releases; newer versions use pd.read_pickle / DataFrame.to_pickle --
# confirm against the installed pandas version.
df = pd.DataFrame.load('df')
df = pd.concat([df, globals()["%s" % gameid]])
df.save('df')

# How can I automate this so I don't have to manually change
# newsoup = pages[1] every time, and scrape all of the boxscores linked from
# the first URL in one go? I am pretty new to Python and lacking in
# understanding of some of the basics.
So in the first code box you collect the pages.
So in the second code box you have to loop over this, if I understood it correctly:
# Process every downloaded boxscore page instead of hand-picking pages[1].
for page in pages:
    soup = BeautifulSoup(page)
    # rest of code here
Comments
Post a Comment