Python 制作简易爬虫
以前一直想爬一些使用了 Cloudflare 的网站,后来发现难度还是太大了:使用知乎上的那些常规手段完全不行。基本上在知乎爬虫教程里出现过的网站(主要是一些招聘网站)都加大了反爬虫的力度,现在爬取数据的难度还是比较大的。
准备工作
实现的代码
import requests
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils import copy
import time, datetime
# URL template for a paginated listing page; formatted with the page number.
BASE_URL = 'https://www.foxebook.net/page/{}/'
# URL template for a book detail page; formatted with the href taken from a
# listing entry.
DETAIL_URL = 'https://www.foxebook.net{}'
# Existing workbook that open_excel() reads and copies into a writable one.
# NOTE(review): xlrd 2.0+ reads only .xls files, so this .xlsx name presumably
# requires xlrd < 2.0 -- confirm against the installed version.
EXCEL_BOOK = 'foxebook.xlsx'
# Index of the next worksheet row to write; incremented after each row.
ROW_INDEX = 0
# Writable workbook copy; assigned by open_excel().
NEW_EXCEL = None
# Download the book cover.
def download_bookimage(bookimage):
    """Placeholder for downloading a book cover.

    Only prints a fixed progress message; ``bookimage`` is not used yet.
    """
    progress_message = 'download book face'
    print(progress_message)
# Get the book name.
def get_bookname(book_detail):
    """Return the ``string`` of the ``<h1>`` located via ``book_detail.find``.

    Args:
        book_detail: Parsed detail page exposing ``find``.
    """
    heading = book_detail.find('h1')
    return heading.string
# Get the book cover.
def get_bookimage(book_detail):
    """Return the ``src`` attribute of the ``<img>`` located via ``find``.

    Args:
        book_detail: Parsed detail page exposing ``find``.
    """
    cover_tag = book_detail.find('img')
    return cover_tag.get('src')
# Get the book's publication details.
def get_publishdate(book_detail):
    """Return the first element with CSS class ``list-unstyled``, or ``None``.

    ``find()`` matches tag names, so passing it the CSS selector
    ``'.list-unstyled'`` never matched anything; ``select_one()`` accepts the
    selector that was intended. The leftover debugging print of the element's
    ``children`` iterator was dropped: it showed only the iterator's repr and
    raised ``IndexError`` on pages without such a list.

    NOTE(review): despite the name, this returns the whole details list, not
    the date itself; get_book_detail() reads the publication date from the
    seventh ``<li>`` of the same list.

    Args:
        book_detail: Parsed detail page exposing ``select_one``.
    """
    return book_detail.select_one('.list-unstyled')
def _detail_value(item):
    """Return the text after the first ':' in ``item.text`` ('' if no colon)."""
    # partition() keeps values that themselves contain ':' intact, where
    # split(':')[1] would cut them at the second colon.
    return item.text.partition(':')[2]


def get_book_detail(bookname):
    """Fetch one book's detail page and append its fields as a worksheet row.

    On HTTP 200 the page is parsed, every field is printed, and one row is
    written to the first sheet of ``NEW_EXCEL`` at ``ROW_INDEX``, which is
    then incremented. On any other status a failure message is printed. A
    page with fewer than eight ``<li>`` entries in its first
    ``.list-unstyled`` element is reported and skipped.

    Args:
        bookname: Path of the detail page (the href taken from a listing
            entry); it is substituted into ``DETAIL_URL``.

    Raises:
        requests.exceptions.RequestException: Network failures, including the
            timeout, propagate to the caller.
    """
    global ROW_INDEX

    detail_url = DETAIL_URL.format(bookname)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(detail_url, timeout=30)
    if response.status_code != 200:
        print('书本:'+bookname+'页面请求失败'+'状态码为:'+str(response.status_code))
        return

    booksheet = NEW_EXCEL.get_sheet(0)
    print('详情页请求成功:{}'.format(bookname))
    book_detail = BeautifulSoup(response.text, 'html.parser')
    bookname = get_bookname(book_detail)
    print('bookname:'+bookname)
    bookimage = get_bookimage(book_detail)
    print('bookimage:'+bookimage)

    # Labels of the first eight <li> entries, in page order.
    labels = (
        'title',
        'author',
        'length',
        'edition',
        'language',
        'publisher',
        'publication_date',
        'ISBN_10',
    )
    detail_lists = book_detail.select('.list-unstyled')
    bookdetails = detail_lists[0].find_all('li') if detail_lists else []
    if len(bookdetails) < len(labels):
        # Skip this page rather than let an IndexError abort the crawl.
        print('unexpected detail page layout, skipped: {}'.format(detail_url))
        return

    values = []
    for label, item in zip(labels, bookdetails):
        value = _detail_value(item)
        print(label + ':' + value)
        values.append(value)
    print(len(bookdetails))

    # A ninth entry, when present, is the ISBN-13.
    if len(bookdetails) == 9:
        isbn_13 = _detail_value(bookdetails[8])
        print('ISBN_13:'+isbn_13)
        booksheet.write(ROW_INDEX, 8, isbn_13)

    # Column 0 holds the <h1> name; the parsed 'title' field is only printed.
    booksheet.write(ROW_INDEX, 0, bookname)
    for column, value in enumerate(values[1:], start=1):
        booksheet.write(ROW_INDEX, column, value)
    ROW_INDEX += 1
def get_bookname_from_list(book_list):
    """Process every listing entry through get_book_detail().

    For each entry, the ``href`` of its first ``<a>`` is passed on as the
    detail-page path.

    Args:
        book_list: Iterable of parsed listing entries exposing ``select``.
    """
    for entry in book_list:
        first_link = entry.select('a')[0]
        get_book_detail(first_link.get('href'))
# Open the Excel workbook.
def open_excel():
    """Load ``EXCEL_BOOK``, keep a writable copy and write the header row.

    The copy is stored in the module-level ``NEW_EXCEL``. The nine column
    titles are written to its first sheet at ``ROW_INDEX``, which is then
    incremented.
    """
    global NEW_EXCEL, ROW_INDEX

    source_book = xlrd.open_workbook(filename=EXCEL_BOOK)
    NEW_EXCEL = copy.copy(source_book)

    # Write the column titles into the first sheet.
    header_sheet = NEW_EXCEL.get_sheet(0)
    column_titles = (
        'bookname',
        'author',
        'length',
        'edition',
        'language',
        'publisher',
        'publication_date',
        'ISBN_10',
        'ISBN_13',
    )
    for column, heading in enumerate(column_titles):
        header_sheet.write(ROW_INDEX, column, heading)
    ROW_INDEX += 1
# Save the Excel workbook.
def close_excel():
    """Save ``NEW_EXCEL`` to a new ``.xls`` file named with the current time.

    The file name embeds a ``YYYYmmddHHMMSS`` local timestamp.
    """
    output_name = 'foxeboot{}.xls'.format(time.strftime('%Y%m%d%H%M%S'))
    NEW_EXCEL.save(output_name)
def get_page_by_number(page_number):
    """Fetch one listing page and process every book entry found on it.

    On HTTP 200 the elements with CSS class ``thumbnail`` are handed to
    get_bookname_from_list(). On any other status an error is printed and the
    workbook is saved via close_excel().

    Args:
        page_number: Value substituted into ``BASE_URL``.

    Raises:
        requests.exceptions.RequestException: Network failures, including the
            timeout, propagate to the caller.
    """
    url = BASE_URL.format(page_number)
    # Without a timeout, requests waits indefinitely on a stalled server.
    current_page = requests.get(url, timeout=30)
    if current_page.status_code != 200:
        print('error on get list')
        # NOTE(review): the source's indentation was lost; this save is
        # assumed to belong to the failure branch -- confirm.
        close_excel()
        return

    soup = BeautifulSoup(current_page.text, 'html.parser')
    get_bookname_from_list(soup.select('.thumbnail'))
def link_start(page_count=2, start_page=0):
    """Crawl ``page_count`` consecutive listing pages from ``start_page``.

    The defaults reproduce the previously hard-coded ``range(2)``, i.e.
    pages 0 and 1.

    NOTE(review): ``/page/N/`` listings may be 1-based; confirm that page 0
    is a valid, distinct page on the target site.

    Args:
        page_count: Number of listing pages to fetch.
        start_page: Number of the first listing page.
    """
    for page_number in range(start_page, start_page + page_count):
        get_page_by_number(page_number)
if __name__ == '__main__':
    open_excel()
    try:
        link_start()
    finally:
        # Save whatever rows were collected even if the crawl aborts midway.
        close_excel()
注意事项
全局变量的使用
最后更新于