Python crawler: scraping novels from the Qidian (起点) site

Let's write another crawler in Python, this time to scrape fantasy (玄幻) novels.
Target site: Qidian (起点)
Modules used: requests, bs4, and os
Basic approach:
fetch the HTML of the page we need, load it into a BeautifulSoup container, and then work with that.

First, get the request URL (left blank here): as you can see, the request method is GET.

Start by fetching the page HTML for the fantasy novel list and loading it into a BeautifulSoup container:

url = ""
method = 'get'
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64)","Referer":""}
res = requests.get(url,headers=headers)
res.encoding = 'utf-8'
# print(res.text)
soup = BeautifulSoup(res.text,'html.parser')
xuanhuan = soup.select('.book-list')
print('book-list:',xuanhuan)
number = 0

The headers dict is a simple workaround for basic anti-scraping checks.
Because there are many pages and links to fetch, it is best to wrap the requests/BeautifulSoup boilerplate in a small class:

from bs4 import BeautifulSoup
import requests
class soupx:
    def soup(self, method, url):
        headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "Referer": ""}
        res = requests.request(method, url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup
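
With the wrapper in place, every later request becomes a one-liner. A minimal usage sketch (the URL below is just a placeholder, not the actual target):

page = soupx().soup('get', 'https://example.com')
print(page.title)          # the <title> tag of the fetched page
links = page.select('a')   # all anchor elements, ready for further filtering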

The complete script:

import os
import time
from reptile.soup4 import soupx

path = 'D:/xiaoshuo/'
# os.makedirs fails if the directory already exists, so check first
if os.path.exists(path):
    print('目录已经存在')
    flag = 1
else:
    os.makedirs(path)
    flag = 0

url = ""
method = 'get'
# headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64)","Referer":""}
# The BeautifulSoup handling is wrapped in the soupx class, so just call it here
soup = soupx().soup(method=method, url=url)
# res = requests.get(url, headers=headers)
# res.encoding = 'utf-8'
# print(res.text)
# soup = BeautifulSoup(res.text, 'html.parser')
xuanhuan = soup.select('.book-list')
print('book-list:', xuanhuan)
number = 0
for book in xuanhuan:
    # Each .book-list block holds the daily/weekly/monthly top-ten fantasy titles
    print('book:', book)
    soup1 = book.select('a')
    # Pop out the anchors that are not needed here
    soup1.pop(1)
    soup1.pop(1)
    soup1.pop(1)
    number += 1
    print('soup1:', soup1)
    time.sleep(0.5)
    for article in soup1:
        # Book title and link
        print('article:', article)
        name = article.text
        herf = article['href']
        herf_article = 'https:' + herf  # the href is protocol-relative, so prepend https:
        print(name, ":", herf_article)
        file = os.path.join(path, name)
        print(file)
        # Fetch the chapter list for this book
        article_soup = soupx().soup(method=method, url=herf_article)
        chapter = article_soup.select('.volume')
        print('chapter:', chapter)
        data_list = chapter[0]('li')
        # print(data_list)
        # Open (or create) the output file for this book
        file_name = open(file + '.txt', 'w+', encoding='utf-8')
        for data in data_list:
            # Chapter title and body text
            chapter_href = "https:" + data.select('a')[0]['href']
            soup = soupx().soup(method='get', url=chapter_href)
            chapter_name = soup.select('.content-wrap')[0].text
            chapter_text = soup.select('.read-content')[0].text
            file_name.writelines(chapter_name)
            print(chapter_name)
            file_name.writelines(chapter_text + '\n')
            time.sleep(0.5)
        file_name.close()

Partial results screenshot: (screenshot not reproduced here)
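
To make the chapter-list parsing easier to follow, here is a small self-contained sketch of the same select pattern run against a hand-written HTML fragment. The class names (.volume) and the link handling come from the script above, but the fragment itself is only an assumption about the page structure, not the real Qidian markup:

from bs4 import BeautifulSoup

# Hypothetical fragment shaped like the markup the crawler expects
html = '''
<div class="volume">
  <ul>
    <li><a href="//book.example.com/ch/1">第一章</a></li>
    <li><a href="//book.example.com/ch/2">第二章</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
volume = soup.select('.volume')[0]
for li in volume('li'):          # tag(...) is shorthand for tag.find_all(...)
    a = li.select('a')[0]
    print(a.text, '->', 'https:' + a['href'])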

= = = = = = = = = = = = = = divider = = = = = = = = = = = = = = = = = = =
The following is a similar crawl that collects company information for Changsha.

import os
import time
import math
from reptile.soup4 import soupx

path = 'D:\\xiaoshuo\\wql'
soup = soupx().soup(method='get', url='/')
title = soup.select('.notop')
for i in title:
    sort_name = i.select('.le')[0].text
    li_sort = i.select('a')
    for a in li_sort:
        li_sort_href = a['href']
        li_sort_name = a.text
        # print(li_sort_name, ':', li_sort_href)
        if os.path.exists(os.path.join(path, li_sort_name)):
            print('目录已经存在')
            flag = 1
        else:
            os.makedirs(os.path.join(path, li_sort_name))
            flag = 0
        path1 = os.path.join(path, li_sort_name)
        # print(path1)
        soup2 = soupx().soup('get', url=li_sort_href)
        li_soup2 = soup2.select('.listwords')[0]('a')
        # print(li_soup2)
        for c in li_soup2:
            soup2_herf = c['href']
            soup2_name = c.text
            print(soup2_name, ":", soup2_herf)
            soup3 = soupx().soup('get', url=soup2_herf)
            li_soup3 = soup3.select('.listwords')[0]('a')
            for z in li_soup3:
                soup3_herf = z['href']
                soup3_name = z.text
                print(soup3_name, ":", soup3_herf)
                path2 = os.path.join(path1, soup3_name)
                os.makedirs(path2)
                soup4 = soupx().soup('get', url=soup3_herf)
                company_list = soup4.select('.com-item')
                # Total number of companies in this category; 36 are listed per page
                number = soup4.select('.total')[0]('span')[0].text
                if int(number) / 36 > 1:
                    # More than one listing page: scrape the first page, then follow .nextPage links
                    for company in company_list:
                        company_name = company.select('a')[0].text
                        company_href = company.select('a')[0]['href']
                        # print(company_name, ':', company_href)
                        path3 = os.path.join(path2, company_name)
                        company_message = soupx().soup('get', url=company_href)
                        try:
                            company_ph_href = company_message.select('.tab')[0]('a')[3]['href']
                            company_phone = soupx().soup('get', url=company_ph_href)
                            company_text = company_phone.select('.contact')[0].text
                        except BaseException as e:
                            print(e)
                        else:
                            file_name = open(path3 + '.txt', 'w+', encoding='utf-8')
                            file_name.writelines(company_name)
                            file_name.writelines(company_text + '\n')
                    # print(soup4.select('.nextPage')[0]('a')[0]['href'])
                    numb = math.ceil(int(number) / 36)
                    for num in range(int(numb) - 1):
                        company_hrefs = soup4.select('.nextPage')[0]('a')[0]['href']
                        # print('这里:', company_hrefs)
                        soup4_1 = soupx().soup('get', company_hrefs)
                        company_list_1 = soup4_1.select('.com-item')
                        for company in company_list_1:
                            company_name = company.select('a')[0].text
                            company_href = company.select('a')[0]['href']
                            print(company_name, ':', company_href)
                            company_message = soupx().soup('get', url=company_href)
                            path3 = os.path.join(path2, company_name)
                            try:
                                company_ph_href = company_message.select('.tab')[0]('a')[3]['href']
                                company_phone = soupx().soup('get', url=company_ph_href)
                                company_text = company_phone.select('.contact')[0].text
                            except BaseException as e:
                                print(e)
                            else:
                                file_name = open(path3 + '.txt', 'w+', encoding='utf-8')
                                file_name.writelines(company_name)
                                file_name.writelines(company_text + '\n')
                elif int(number) / 36 <= 1:
                    # A single listing page is enough
                    for company in company_list:
                        company_name = company.select('a')[0].text
                        company_href = company.select('a')[0]['href']
                        # print(company_name, ':', company_href)
                        path3 = os.path.join(path2, company_name)
                        company_message = soupx().soup('get', url=company_href)
                        try:
                            company_ph_href = company_message.select('.tab')[0]('a')[3]['href']
                            company_phone = soupx().soup('get', url=company_ph_href)
                            company_text = company_phone.select('.contact')[0].text
                        except BaseException as e:
                            print(e)
                        else:
                            file_name = open(path3 + '.txt', 'w+', encoding='utf-8')
                            file_name.writelines(company_name)
                            file_name.writelines(company_text + '\n')
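
The pagination logic above assumes 36 companies per listing page, so the number of extra pages to follow is just a ceiling division minus one. A quick check of the arithmetic (the totals here are made-up numbers):

import math

for total in (20, 36, 37, 100):
    pages = math.ceil(total / 36)   # total number of listing pages
    extra = max(pages - 1, 0)       # pages beyond the first one
    print(total, 'companies ->', pages, 'page(s),', extra, 'extra page(s)')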
