from lxml import etree

text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>  <!-- note: the closing </li> tag is deliberately missing here -->
    </ul>
</div>
'''

html = etree.HTML(text)      # parse the string into an html document; etree.HTML auto-completes the missing li, body and html tags
print(html)                  # <Element html at 0x25be0d7adc8>
print(type(html))            # <class 'lxml.etree._Element'>
html = etree.tostring(html)  # serialize the html document into a byte string (bytes)
print(html.decode())         # decode the byte string into a str
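The serialized output above comes back as one dense byte string. As a minimal sketch (not part of the original), etree.tostring also accepts the standard pretty_print and encoding keyword arguments, which make the auto-completed tree easier to inspect:

# A minimal sketch: pretty-printing the repaired tree.
# pretty_print and encoding='unicode' are standard etree.tostring keyword arguments.
from lxml import etree

broken = '<li class="item-0"><a href="link5.html">fifth item</a>'
html = etree.HTML(broken)
print(etree.tostring(html, pretty_print=True, encoding='unicode'))
# the output shows the auto-completed <html><body>...</body></html> wrapper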
from lxml import etree

# read the external file hello.html
html = etree.parse('./hello.html')
# serialize it into a bytes html string
result = etree.tostring(html)
# parse it back into an html document
html = etree.HTML(result)
# serialize it again into a bytes html string
result = etree.tostring(html)
# decode it as utf-8 and print
print(result.decode("utf-8"))
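One caveat worth noting: etree.parse uses an XML parser by default, so it only accepts well-formed markup such as the hello.html snippet below. For real-world HTML with unclosed tags, a tolerant etree.HTMLParser is usually passed in. A short sketch, assuming hello.html exists in the current directory:

# Parse an HTML file with a tolerant parser instead of the default XML parser.
from lxml import etree

parser = etree.HTMLParser()                 # repairs broken markup instead of raising an error
tree = etree.parse('./hello.html', parser)
print(etree.tostring(tree).decode('utf-8'))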
The contents of hello.html:
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">haha<span class="bold">third item</span></a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
An xpath() query for nodes or attributes returns a list, so the results need to be iterated over.
from lxml import etree

html_obj = etree.parse('hello.html')

result_list = html_obj.xpath('//li')        # get all li elements
for result in result_list:
    print(etree.tostring(result).decode())  # serialize each li element and decode it to an html string

result_list = html_obj.xpath('//li/@class') # get the class attribute values of all li elements
print(result_list)
for result in result_list:
    print(result)

result_list = html_obj.xpath('//li/a[@href="link1.html"]/text()')  # get the text of the a element under li whose href is link1.html
# result_list = html_obj.xpath('//li/a[@href="link1.html"]')       # get the a element itself instead of its text
print(result_list)
for result in result_list:
    print(result)
    # print(etree.tostring(result).decode())  # only valid for the element query above; text() results are strings, not elements

# result_list = html_obj.xpath('//li//span/text()')  # get the text of all span descendants of li
result_list = html_obj.xpath('//li/a/span/text()')   # get the text of span elements under a under li
# result_list = html_obj.xpath('//li/a/text()')      # get the direct text of a elements, excluding the text of their descendants
for result in result_list:
    print(result)

result_list = html_obj.xpath('//li[last()]/a/@href')  # get the href attribute of the a element in the last li
print(result_list)

result_list = html_obj.xpath('//li[last()-1]/a')      # get the a element in the second-to-last li
print(result_list[0].text)

result_list = html_obj.xpath('//li[last()-2]//text()')  # get all text inside the third-to-last li, including its descendants' text
print('dddd', result_list)

result_list = html_obj.xpath('//*/@class')  # get the class attribute values of all elements
for result in result_list:
    print(result)

result_list = html_obj.xpath('//*[@class="bold"]/text()')    # get the text of elements whose class is "bold"
print(result_list)

result_list = html_obj.xpath('//*[contains(@href,"html")]')  # get all elements whose href contains "html"
for result in result_list:
    print(etree.tostring(result).decode())
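Beyond the predicates above, XPath 1.0 also supports positional indexing, position(), starts-with(), and count(). A short sketch (not from the original) run against the same hello.html, using the tolerant parser from earlier:

# A few more XPath predicates against hello.html.
from lxml import etree

html_obj = etree.parse('hello.html', etree.HTMLParser())

print(html_obj.xpath('//li[1]/a/text()'))                      # first li (XPath counts from 1)
print(html_obj.xpath('//li[position()<3]/a/text()'))           # first two li elements
print(html_obj.xpath('//a[starts-with(@href,"link")]/@href'))  # href values that start with "link"
print(html_obj.xpath('count(//li)'))                           # number of li elements, returned as a float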
Scraping images from a Baidu Tieba forum
import requests
from lxml import etree
import os


def save_image(result):
    response = requests.get(result)
    if not os.path.exists('images'):
        os.makedirs('images')
    image_file_name = result[-10:]  # use the last 10 characters of the image URL as the file name
    with open('images/' + image_file_name, 'wb') as f:
        f.write(response.content)
    print('saved', image_file_name)


def get_image_url(tieba_detail_url):
    response = requests.get(tieba_detail_url)
    html_obj = etree.HTML(response.content.decode())  # response.text works here as well
    result_list = html_obj.xpath('//div[@class="d_post_content j_d_post_content "]/img/@src')
    for result in result_list:
        save_image(result)


def tieba_spider(kw, url, start_page, end_page):
    for page in range(start_page, end_page + 1):
        page = (page - 1) * 50  # Tieba paginates with pn = 50 * (page - 1)
        params = {
            'pn': str(page),
            'kw': kw
        }
        # headers = {
        #     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        # }
        response = requests.get(url, params=params)
        html = response.content.decode()
        print(response.url)
        html_obj = etree.HTML(html)
        result_list = html_obj.xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
        for result in result_list:
            tieba_detail_url = "https://tieba.baidu.com" + result
            get_image_url(tieba_detail_url)


def main():
    kw = input("Enter the name of the tieba to scrape: ")
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    url = "https://tieba.baidu.com/f?ie=utf-8"
    tieba_spider(kw, url, start_page, end_page)


if __name__ == '__main__':
    main()
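One fragile spot in the spider is the file name: slicing the last 10 characters of the URL can cut a name in half or collide across posts. A hedged alternative sketch (same role as the save_image above, but not the original implementation) derives the name with os.path.basename instead:

# A variant of save_image: derive the file name from the URL path
# instead of slicing the last 10 characters.
import os
import requests

def save_image(image_url):
    response = requests.get(image_url)
    os.makedirs('images', exist_ok=True)           # no explicit exists check needed
    image_file_name = os.path.basename(image_url)  # e.g. 'abc123.jpg' from '.../abc123.jpg'
    with open(os.path.join('images', image_file_name), 'wb') as f:
        f.write(response.content)
    print('saved', image_file_name)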