Using libxml2 and python to scrape content from a website

Posted on Feb. 06, 2007, under Knowledge Base, Technology, Tutorials

This is a practical example of using Libxml2 to parse a real-world Web page. I chose the TV listings pages from the Guardian website, as they are the type of page you are likely to want to scrape for useful data.

Additionally, the Guardian TV listings contain a couple of very typical HTML errors. The listings are contained within a table, and some of the rows in the table are not closed.

The Libxml2 parser recovers from these errors by closing the tags at the end of the page and then continuing to parse from the next usable opening tag. This leaves us with a tr tag containing duplicated content from later in the document. The code below handles this in a simple way: it splits the broken content on the next opening tag and uses another document instance to close the unterminated tags.

TV Listings

import libxml2, os, sys, datetime

# This script reads the TV listings from the Guardian TV listings website
# (http://www.guardian.co.uk/TV/)

# Flags handed to every libxml2 HTML parse call in this script: keep going
# on malformed markup and do not report parser errors or warnings.
parse_options = (
	libxml2.HTML_PARSE_RECOVER
	| libxml2.HTML_PARSE_NOERROR
	| libxml2.HTML_PARSE_NOWARNING
)

# Reference dates, fixed once when the script starts. newTime() uses them
# to decide which calendar day a listed time belongs to.
today = datetime.date.today()
tomorrow = today + datetime.timedelta(1)

class Channel:
	"""A TV channel: its display name plus the listing entries read for it."""

	def __init__(self, name):
		# name    -- label printed for this channel
		# entries -- starts empty; each instance gets its own fresh list
		self.name, self.entries = name, []

class ListingEntry:
	"""One programme slot: a start time with a title and description."""

	def __init__(self,when):
		# title and content both start empty and are filled in later by
		# appending text to them
		self.title = self.content = ''
		self.when = when

def newTime(node, entries):
	"""Start a new ListingEntry from a time cell such as '6.00am'.

	node    -- cell node whose text content holds the listed time
	entries -- list to which the new ListingEntry is appended

	A cell whose text contains neither 'am' nor 'pm' is ignored and no
	entry is created. Malformed times (no '.', non-numeric parts) raise
	the usual IndexError / ValueError.
	"""
	timeStr = str(node.content).strip()

	# the Guardian listings show times like 6.00am or 5.45pm, we need to
	# turn this into a more useable form. A Python datetime object will do
	# just fine. It is also worth noting that the listings run from 6.00am
	# to 6.00am, so we need to account for a date boundary at midnight.
	if ('am' in timeStr) or ('pm' in timeStr):
		t = timeStr.split('.')
		hour = int(t[0])
		minute = int(t[1][0:2])
		ampm = t[1][2:4]

		if (ampm == 'pm') and (hour < 12):
			hour += 12
		elif (ampm == 'am') and (hour == 12):
			# 12.xxam is just after midnight: hour 0 on the 24-hour clock.
			# Without this the programme would be filed at noon today.
			hour = 0

		# anything before 6.00am belongs to the tail end of the listing
		# day, which falls on the next calendar date
		if (hour < 6):
			date = tomorrow
		else:
			date = today

		when = datetime.datetime(date.year, date.month, date.day, hour, minute)
		entries.append(ListingEntry(when))

def newProgramme(node, entries):
	"""Add the text of a programme cell to the most recent listing entry.

	node    -- cell node; its <font> descendants hold the programme text
	entries -- list of entries; only the last one is modified

	The first non-blank piece of text becomes the entry's title. Every
	later piece is appended to its content, one per line. If entries is
	empty (no time cell has been seen yet) there is nothing to attach
	the text to, so the cell is skipped instead of raising IndexError.
	"""
	if not entries:
		return

	current = entries[-1]

	# luckily for us, all the Guardian TV entries are wrapped in <font> tags
	# which is bad for accessibility, but gives us a known node to grab
	for item in node.xpathEval('.//font/node()'):
		if item.isBlankNode():
			continue
		if item.type not in ('text', 'element'):
			# comments and other node kinds carry no programme text
			continue

		text = str(item.content).strip()
		if current.title == '':
			current.title += text
		else:
			current.content += text + '\n'

def _processCells(cells, entries):
	# Dispatch one row's cells: the first cell holds the time, the rest
	# hold programme text.
	for cell in cells:
		if cell.prev is None:
			# if the cell has no previous sibling then it is the first
			# cell in the row, e.g. the one containing the time
			newTime(cell, entries)
		else:
			newProgramme(cell, entries)

def processSourceHTML(url,entries):
	"""Read one listings page and append what it contains to entries.

	url     -- location of the listings page, passed to libxml2.htmlReadFile
	entries -- list that receives the entries built from the table rows

	The listings are taken from the seventh <table> in the document; an
	IndexError is raised if the page has fewer tables. Both the main
	document and any temporary repair document are freed even when
	parsing or extraction raises.
	"""
	doc = libxml2.htmlReadFile(url, None, parse_options)
	try:
		listingTable = doc.xpathEval('//table')[6]
		for row in listingTable.xpathEval('.//tr'):
			if len(row.xpathEval('.//tr')) > 0:
				# This row is broken, tr tags should not contain more tr tags!
				# it probably is missing one or more closing tags and therefore
				# needs special handling.
				fragments = row.serialize().split('<tr>')

				# Here we load the broken HTML fragment into another document
				# to extract whatever we can from it. fragments[1] is the
				# text between the row's own opening tag and the next <tr>.
				fixDoc = libxml2.htmlReadDoc('<html>'+fragments[1]+'</html>', \
					'', None, parse_options)
				try:
					_processCells(fixDoc.xpathEval('//td'), entries)
				finally:
					fixDoc.freeDoc()
			else:
				_processCells(row.xpathEval('td'), entries)
	finally:
		doc.freeDoc()

# Channels are scraped in the order listed here; each one is registered
# first and then filled from its own listings page.
#
# We could do more here from an automation perspective - spider the list
# of channels, automatically populating the channel names etc...
# but this is left as an exercise for the reader
channels = []

for channelName, listingUrl in (
	('BBC1', 'http://www.guardian.co.uk/TV/bbc1s_meridian.html'),
	('BBC2', 'http://www.guardian.co.uk/TV/bbc2s_meridian.html'),
	('ITV - Meridian', 'http://www.guardian.co.uk/TV/meridian_meridian.html'),
	('Channel 4', 'http://www.guardian.co.uk/TV/ch4_meridian.html'),
):
	channels.append(Channel(channelName))
	processSourceHTML(listingUrl, channels[-1].entries)

# Print the results: the channel name, then for every entry a separator,
# its start time, its title, another separator and its description.
# A single parenthesised value keeps each line a plain Python 2 print.
for channel in channels:
	print(channel.name)
	for entry in channel.entries:
		for line in ("----", entry.when, entry.title, "----", entry.content):
			print(line)
:, , , ,

Leave a Reply

Looking for something?

Use the form below to search the site:

Still not finding what you're looking for? Drop a comment on a post or contact us so we can take care of it!