使用lxml解析html文件
之前一直搞错了，实际上只需要使用 lxml.html 模块：读入 HTML 文件后，直接调用 xpath 定位所需要的内容就行了，不再需要 etree 之类的其他模块。具体看下面的例子：
Python语言: 使用lxml解析html文件
01 #coding: utf-8
02
03 from lxml import *
04 import lxml.html as H
05 import urllib2
06
def getart(url):
    """Fetch a page and return the text of the paragraphs matched by the XPath.

    Written for Python 2 (it relies on ``urllib2``).

    Args:
        url: Address of the HTML page to download.

    Returns:
        A list with one UTF-8 encoded string per matched ``<p>`` element,
        with every carriage return replaced by a newline.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
    finally:
        # Release the connection even if reading the body fails.
        f.close()

    doc = H.document_fromstring(content)
    # Absolute path to the target paragraphs; it is hard-coded, so it must be
    # updated whenever the structure of the fetched page changes.
    paragraphs = doc.xpath('/html/body/div[2]/div[2]/div/div/ul/li/div[2]/p')
    return [
        p.text_content().encode('utf-8').replace('\r', '\n')
        for p in paragraphs
    ]
20
if __name__ == '__main__':
    # Pages to scrape; the extracted text of each is appended in this order.
    urls = ['http://www.douban.com/group/topic/12018319/',
            'http://www.douban.com/group/topic/12018319/?start=100']
    # The with-statement closes the output file even if a download or a
    # write raises part-way through.
    with open('zheda.txt', 'w') as outfile:
        for url in urls:
            for item in getart(url):
                # One extracted paragraph per line.
                outfile.write(item + '\n')
02
03 from lxml import *
04 import lxml.html as H
05 import urllib2
06
def getart(url):
    """Fetch a page and return the text of the paragraphs matched by the XPath.

    Written for Python 2 (it relies on ``urllib2``).

    Args:
        url: Address of the HTML page to download.

    Returns:
        A list with one UTF-8 encoded string per matched ``<p>`` element,
        with every carriage return replaced by a newline.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
    finally:
        # Release the connection even if reading the body fails.
        f.close()

    doc = H.document_fromstring(content)
    # Absolute path to the target paragraphs; it is hard-coded, so it must be
    # updated whenever the structure of the fetched page changes.
    paragraphs = doc.xpath('/html/body/div[2]/div[2]/div/div/ul/li/div[2]/p')
    return [
        p.text_content().encode('utf-8').replace('\r', '\n')
        for p in paragraphs
    ]
20
if __name__ == '__main__':
    # Pages to scrape; the extracted text of each is appended in this order.
    urls = ['http://www.douban.com/group/topic/12018319/',
            'http://www.douban.com/group/topic/12018319/?start=100']
    # The with-statement closes the output file even if a download or a
    # write raises part-way through.
    with open('zheda.txt', 'w') as outfile:
        for url in urls:
            for item in getart(url):
                # One extracted paragraph per line.
                outfile.write(item + '\n')
附:浙大夜惊魂6月26日更新至101章
hotfile: http://hotfile.com/dl/50704792/39ca85e/626101.txt.html
xun6: http://is.gd/d4Ym0
