Scraping a Novel with a Python Crawler
As a beginner in web scraping, I have just learned how to use the requests library and XPath, so I used the two together to scrape a novel as practice.
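As a warm-up, here is a minimal sketch of how the two libraries fit together; the URL and the XPath expression below are placeholders for illustration, not taken from the actual project:

import requests
from lxml import etree

# Download a page; requests can guess the charset if the page doesn't declare one
r = requests.get('https://example.com')  # placeholder URL
r.encoding = r.apparent_encoding
# Parse the HTML into an element tree that supports XPath queries
tree = etree.HTML(r.text)
# xpath() always returns a list, even when there is a single match
print(tree.xpath('//h1/text()'))  # placeholder expression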
Goals:
1. Scrape the novel.
2. Create a local folder named after the novel.
3. Save the novel's content into that folder as text (.txt) files.

# Import the libraries we need
import requests
from lxml import etree
import os

# The target URL (the novel's table of contents)
url = 'https://www.kanunu8.com/book3/6879/index.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68',
}

# Get the novel's title
def get_name(url):
    r = requests.get(url, headers=headers)
    r.encoding = 'gb2312'
    tree = etree.HTML(r.text)
    title = tree.xpath('//tr[1]/td/h1/strong/font/text()')[0]
    return title

# Collect every chapter link from the index page and return them as a list
def get_pageurl(url):
    r = requests.get(url, headers=headers)
    r.encoding = 'gb2312'
    tree = etree.HTML(r.text)
    title = tree.xpath('//tr[1]/td/h1/strong/font/text()')[0]
    urls = tree.xpath('//tr[4]/td/table[2]/tbody/tr[@bgcolor="#ffffff"]/td/a/@href')
    print(title)
    links = []
    for i in urls:
        links.append('https://www.kanunu8.com/book3/6879/' + i)
    return links

# Scrape a single chapter; return its title and content
def get_content(url):
    r = requests.get(url, headers=headers)
    r.encoding = 'gb2312'
    tree = etree.HTML(r.text)
    title = tree.xpath('//tr[1]/td/strong/font/text()')[0]
    content = tree.xpath('//tr/td[2]/p/text()')
    content = '\n'.join([i.strip() for i in content])
    return title, content

# Create a local folder named after the novel
def mkfolder(folder_name):
    if not os.path.exists(folder_name):
        print('Folder does not exist, creating it first')
        os.makedirs(folder_name)
    # Change into the folder
    os.chdir(folder_name)

# Save a chapter to a local text file
def save_to_file(title, content):
    print(title + ' saving...')
    with open(f'{title}.txt', 'w', encoding='utf-8') as f:
        f.write(content)
    print(title + ' saved!')

# The main driver function
def run_spider():
    # Get the novel's title
    name = get_name(url)
    # Get the chapter URLs
    catalogue = get_pageurl(url)
    path = f'./{name}'
    # Create the folder
    mkfolder(path)
    # Walk through every chapter
    for i in catalogue:
        title, content = get_content(i)
        # Download this chapter
        save_to_file(title, content)
    print('All chapters downloaded!')

# Run it
if __name__ == '__main__':
    run_spider()
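One caveat the script does not handle: a chapter title may contain characters that are illegal in file names (such as / or ?), which would make open() fail. A small helper like the sketch below (my own addition, not part of the original code) can be applied to the title before save_to_file is called; it is also friendlier to the site to time.sleep() for a moment between chapter requests.

import re

# Hypothetical helper, not in the original script:
# replace characters that common filesystems reject in file names
def safe_filename(title):
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# e.g. inside run_spider:
#     save_to_file(safe_filename(title), content)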
Result: the script creates a folder named after the novel and saves each chapter as a separate .txt file inside it.