import requests
from bs4 import BeautifulSoup
# Fetch a page as UTF-8 text; return a sentinel string on any request failure.
def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()           # raise on HTTP 4xx/5xx
        r.encoding = 'utf-8'           # the site serves UTF-8 pages
        return r.text
    except requests.RequestException:  # network or HTTP error
        return "ERROR"
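# Illustrative variant, not part of the original script: some sites reject
# the default requests User-Agent, so a browser-like header can help. The
# HEADERS dict and the function name here are assumptions for this sketch.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def get_html_with_headers(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)  # same fetch, plus UA header
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return "ERROR"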
# Scrape the ranking page: write every ranking list to novel_list.csv and
# collect all book links.
def get_content(url):
    url_list = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    # The page has two kinds of ranking blocks with identical inner markup:
    # the per-category top lists and the finished/history lists.
    category_list = soup.find_all('div', attrs={'class': 'index_toplist mright mbottom'})
    history_finished_list = soup.find_all('div', attrs={'class': 'index_toplist mbottom'})
    for cate in category_list + history_finished_list:
        name = cate.find('div', class_='toptab').span.string  # ranking list title
        with open('E:/novel_list.csv', 'a', encoding='utf-8') as f:
            f.write("\n小说种类:{} \n".format(name))
        print("小说种类:%s" % name)
        # Only the currently displayed tab of each block holds the <li> entries.
        general_list = cate.find(style='display: block;')
        book_list = general_list.find_all('li')
        for i, book in enumerate(book_list, start=1):
            link = 'http://www.qu.la' + book.a['href']
            title = book.a['title']
            url_list.append(link)
            with open('E:/novel_list.csv', 'a', encoding='utf-8') as f:
                f.write("小说名:{:<} \t 小说地址:{:<} \n".format(title, link))
            print("第%d名小说名:%s\t小说地址:%s" % (i, title, link))
    return url_list
# Scrape a novel's index page: save the chapter list to a text file and
# return (chapter links, novel title).
def get_txt_url(url):
    url_list = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    chapter_items = soup.find_all('dd')  # each <dd> wraps one chapter link
    txt_name = soup.find('h1').text      # novel title
    with open('E:/xiaoshuo/{}.txt'.format(txt_name), 'a', encoding='utf-8') as f:
        f.write('小说标题:{} \n'.format(txt_name))
    print("小说标题:%s\n" % txt_name)
    for item in chapter_items:
        title_name = item.a.string
        url_list.append('http://www.qu.la' + item.a['href'])
        with open('E:/xiaoshuo/{}.txt'.format(txt_name), 'a', encoding='utf-8') as f:
            f.write('%s 链接:http://www.qu.la%s \n' % (title_name, item.a['href']))
        print('%s 链接:http://www.qu.la%s' % (title_name, item.a['href']))
    return url_list, txt_name
# Download a single chapter and append its text to a file named after the
# page title.
def get_one_txt(url):
    html = get_html(url).replace('<br>', '\n')  # keep paragraph breaks as newlines
    soup = BeautifulSoup(html, 'lxml')
    # The chapter body sits in <div id="content">; strip the site's stray
    # "chaptererror();" script text that leaks into it.
    txt = soup.find('div', id='content').text.replace('chaptererror();', '')
    title = soup.find('title').text
    with open('E:/xiaoshuo/{}.txt'.format(title), 'a', encoding='utf-8') as f:
        f.write('\t\t\t\t\t\t' + title + '\n\n')
        f.write(txt)
    print(txt)
    print('当前章节{} 已经下载完毕'.format(title))
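# Illustrative sketch, not part of the original script: get_txt_url already
# returns every chapter link, so the two helpers above can be chained to
# download a whole novel rather than one chapter. download_novel is a
# hypothetical name introduced only for this sketch.
def download_novel(novel_url):
    chapter_urls, txt_name = get_txt_url(novel_url)  # chapter links + novel title
    for chapter_url in chapter_urls:
        get_one_txt(chapter_url)                     # append each chapter in order
    print('{} 全部章节下载完毕'.format(txt_name))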
if __name__ == '__main__':
    # Prompt for URLs here rather than at module level, so importing this
    # file does not block on input().
    url1 = 'https://www.qu.la/paihangbang/'          # ranking page
    url2 = input("请输入需要爬取章节的小说网址:")      # novel index page
    url3 = input("请输入需要爬取内容的章节网址:")      # single chapter page
    get_content(url1)
    get_txt_url(url2)
    get_one_txt(url3)