Using libxml2 and python to scrape content from a website
by Mike on Feb.06, 2007, under Knowledge Base, Technology, Tutorials
This is a practical example, using Libxml2 to parse a real-world Web page (I chose the TV listings pages from the Guardian Website, as they are the type of page you are likely to want to scrape for useful data).
Additionally, the Guardian TV listings contain a couple of very typical HTML errors. The listings are contained within a table, and some of the rows in the table are not closed.
The Libxml2 parser recovers from these errors by closing the tags at the end of the page and then continuing parsing from the next useable opening tag. This leaves us with a tr tag containing duplicated content from later in the document, which this code handles in a simple way by splitting the broken content on the next opening tag, and using another document instance to close the unterminated tags.
TV Listings
import libxml2, os, sys, datetime
# This script reads the TV listings from the Guardian TV listings website
# (http://www.guardian.co.uk/TV/)
# Parser flags shared by every HTML parse in this script: recover from
# malformed markup and keep libxml2 from reporting errors or warnings.
parse_options = (libxml2.HTML_PARSE_RECOVER
                 | libxml2.HTML_PARSE_NOERROR
                 | libxml2.HTML_PARSE_NOWARNING)

# Reference dates, fixed once when the script starts; newTime() uses them
# to date each listing entry.
today = datetime.date.today()
tomorrow = datetime.timedelta(days=1) + today
class Channel:
    """A named TV channel and the list of listing entries gathered for it."""

    def __init__(self, name):
        # every channel gets its own, initially empty, list of entries
        self.name, self.entries = name, []
class ListingEntry:
    """One programme slot: when it starts, its title and its description."""

    def __init__(self,when):
        # title and content start out empty and are filled in afterwards
        self.title = self.content = ''
        self.when = when
def newTime(node, entries):
    """Parse a time cell and append a new ListingEntry for it to entries.

    node    -- object whose ``content`` holds the cell text, e.g. '6.00am'
    entries -- list that receives the new ListingEntry

    Cells whose text does not contain a usable time are ignored and leave
    entries untouched.
    """
    timeStr = str(node.content).strip()
    # the Guardian listings show times like 6.00am or 5.45pm, we need to
    # turn this into a more useable form. A Python datetime object will do
    # just fine. It is also worth noting that the listings run from 6.00am
    # to 6.00am, so we need to account for a date boundary at midnight.
    if ('am' not in timeStr) and ('pm' not in timeStr):
        return
    t = timeStr.split('.')
    try:
        hour = int(t[0])
        minute = int(t[1][0:2])
    except (ValueError, IndexError):
        # the text merely contains the letters 'am' or 'pm' (or has no
        # '.' separator) and is not an h.mm time, so skip the cell
        return
    ampm = t[1][2:4]
    if (ampm == 'pm') and (hour < 12):
        hour += 12
    elif (ampm == 'am') and (hour == 12):
        # 12.xxam is the hour after midnight, not midday
        hour = 0
    # anything before 6.00am belongs to the tail end of the listings day,
    # which falls on the next calendar date
    if hour < 6:
        date = tomorrow
    else:
        date = today
    when = datetime.datetime(date.year, date.month, date.day, hour, minute)
    entries.append(ListingEntry(when))
def newProgramme(node, entries):
    """Add the text found in a programme cell to the latest listing entry.

    node    -- table cell node; its ``.//font/node()`` children are read
    entries -- list of listing entries; only the last one is modified

    The first non-empty piece of text becomes the entry's title, every
    later piece is appended to its content followed by a newline. If
    entries is empty the cell is ignored.
    """
    if not entries:
        # no time cell has created an entry yet, so there is nothing to
        # attach this programme text to
        return
    entry = entries[-1]
    # luckily for us, all the Guardian TV entries are wrapped in <font> tags
    # which is bad for accessibility, but gives us a known node to grab
    for item in node.xpathEval('.//font/node()'):
        if item.isBlankNode():
            continue
        if item.type not in ('text', 'element'):
            continue
        text = str(item.content).strip()
        if entry.title == '':
            entry.title = text
        else:
            entry.content += text + '\n'
def _processCells(cells, entries):
    # Hand each cell of one listing row to the time or programme handler.
    for cell in cells:
        if cell.prev is None:
            # if the cell has no previous sibling then it is the first
            # cell in the row, e.g. the one containing the time
            newTime(cell, entries)
        else:
            newProgramme(cell, entries)


def processSourceHTML(url,entries):
    """Read one listings page and append what it contains to entries.

    url     -- location of the HTML page, passed to libxml2.htmlReadFile
    entries -- list that newTime() / newProgramme() fill in

    The listings are taken from the seventh table of the page (index 6);
    an IndexError is raised if the page has fewer tables. Both the page
    document and any temporary fix-up document are freed even when
    processing fails part-way through.
    """
    doc = libxml2.htmlReadFile(url, None, parse_options)
    try:
        listingTable = doc.xpathEval('//table')[6]
        for row in listingTable.xpathEval('.//tr'):
            if len(row.xpathEval('.//tr')) > 0:
                # This row is broken, tr tags should not contain more tr
                # tags! It probably is missing one or more closing tags
                # and therefore needs special handling.
                fragments = row.serialize().split('<tr>')
                if len(fragments) < 2:
                    # nothing follows a plain '<tr>' in the serialized
                    # row, so there is no fragment to recover
                    continue
                # Here we load the broken HTML fragment into another
                # document to extract whatever we can from it.
                fixDoc = libxml2.htmlReadDoc(
                    '<html>' + fragments[1] + '</html>',
                    '', None, parse_options)
                try:
                    _processCells(fixDoc.xpathEval('//td'), entries)
                finally:
                    fixDoc.freeDoc()
            else:
                _processCells(row.xpathEval('td'), entries)
    finally:
        doc.freeDoc()
# The channels to fetch, as (display name, listings page URL) pairs.
# More could be automated here - spidering the list of channels and
# populating the channel names automatically, for instance - but that is
# left as an exercise for the reader.
listingSources = (
    ('BBC1', 'http://www.guardian.co.uk/TV/bbc1s_meridian.html'),
    ('BBC2', 'http://www.guardian.co.uk/TV/bbc2s_meridian.html'),
    ('ITV - Meridian', 'http://www.guardian.co.uk/TV/meridian_meridian.html'),
    ('Channel 4', 'http://www.guardian.co.uk/TV/ch4_meridian.html'),
)

channels = []
for channelName, listingUrl in listingSources:
    # register the channel first, then fill in its entries from the page
    channels.append(Channel(channelName))
    processSourceHTML(listingUrl, channels[-1].entries)

# Dump everything that was collected, one channel after another.
for channel in channels:
    print(channel.name)
    for entry in channel.entries:
        print("----")
        print(entry.when)
        print(entry.title)
        print("----")
        print(entry.content)