"""Scrape book listings from Dangdang search results into an Excel workbook."""
import os import urllib.request from urllib.parse import quote
import openpyxl import win32com.client as win32 import xlwt from bs4 import BeautifulSoup from xlrd import open_workbook from xlutils.copy import copy import time
# Name of the intermediate .xls workbook; every function below reads and
# writes this file relative to the current working directory.
filename = "Excel_test1.xls"

# Create the workbook on disk up front with a single placeholder sheet named
# " ". Later steps reopen the file from disk, and deleteSheet() removes the
# placeholder again when it was not used for data.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet(" ")
workbook.save(filename)
def findISBN(s):
    """Extract the 13 characters that follow the last ISBN label in *s*.

    Args:
        s: Text to search (``kore`` passes the stringified detail list of a
            product page).

    Returns:
        The 13 characters after the last "国际标准书号ISBN:" marker, or an
        empty string when the marker does not occur. The result is not
        validated here; callers check ``isdigit()`` themselves.
    """
    findContent = "国际标准书号ISBN:"
    markerPosition = s.rfind(findContent)
    if markerPosition == -1:
        # rfind's -1 would otherwise be folded into the offset and an
        # arbitrary slice of the text would be returned as the "ISBN".
        return ""
    isbnStartPosition = markerPosition + len(findContent)
    return s[isbnStartPosition:isbnStartPosition + 13]


def findId(s):
    """Extract the product id from a Dangdang product URL.

    Args:
        s: URL text containing "http://product.dangdang.com/".

    Returns:
        The text between the last occurrence of that prefix and the final
        five characters of *s* (presumably the ".html" extension), or an
        empty string when the prefix does not occur.
    """
    findContent = "http://product.dangdang.com/"
    prefixPosition = s.rfind(findContent)
    if prefixPosition == -1:
        # Same guard as in findISBN: never slice from a bogus offset.
        return ""
    idStartPosition = prefixPosition + len(findContent)
    return s[idStartPosition:-5]


def _fetch_html(url):
    """Return the raw bytes served at *url*, closing the connection afterwards."""
    with urllib.request.urlopen(url) as response:
        return response.read()


def kore(keyword, ranges=1, flag=False):
    """Core routine: collect book information for one search keyword.

    Fetches ``ranges`` pages of Dangdang search results for *keyword*, opens
    every listed product page to read its ISBN and category links, and
    writes one row per book whose ISBN is all digits into the workbook named
    by the module-level ``filename``.

    Args:
        keyword: Search term; also used as the sheet name when *flag* is
            falsy.
        ranges: Number of result pages to fetch (pages 1..ranges).
        flag: When truthy, rows are appended to the existing " " sheet;
            otherwise a new sheet named after *keyword* is added and filled
            from row 1.
    """
    rexcel = open_workbook(filename)
    excel = copy(rexcel)
    if flag:
        table = excel.get_sheet(" ")
    else:
        table = excel.add_sheet(keyword)
    excel.save(filename)
    sheet = table

    headers = ("id", "题目", "ISBN", "作者", "定价", "售价", "出版社", "类别")
    for column, header in enumerate(headers):
        sheet.write(0, column, header)

    if flag:
        # Append below whatever is already stored. A freshly created file
        # reports zero rows, which would place the first record on the
        # header row, so never start above row 1.
        count = max(rexcel.sheets()[0].nrows, 1)
    else:
        count = 1

    # Number of product pages processed so far, across all result pages.
    icount = 0
    for i in range(1, ranges + 1):
        # safe="" keeps "/" percent-encoded, exactly as the former positional
        # call did (its second argument was `safe`, not the encoding).
        url = "http://search.dangdang.com/?key={}&act=input&page_index={}".format(
            quote(keyword, safe="", encoding="utf-8"), i)
        html = _fetch_html(url).decode('gb18030')
        soup = BeautifulSoup(html, "html.parser")

        title = soup.findAll(name="a", attrs={"name": "itemlist-title"})
        # Keep one entry per author paragraph even when it has no link, so
        # the parallel lists zipped below stay aligned.
        author = [
            p.a.string if p.a is not None else None
            for p in soup.findAll(name='p', attrs={"class": "search_book_author"})
        ]
        pre_price = soup.findAll(name="span", attrs={'class': 'search_pre_price'})
        now_price = soup.findAll(name="span", attrs={'class': 'search_now_price'})
        publisher = soup.findAll(name="a", attrs={'name': 'P_cbs'})

        ISBN = []
        classies = []
        idNumbers = []
        loopStop = 0
        for k in title:
            # Pause briefly after every 20 product pages.
            loopStop += 1
            if loopStop >= 20:
                loopStop = 0
                time.sleep(0.5)
            icount += 1

            childurl = k.get('href')
            idNumber = findId(childurl)
            idNumbers.append(idNumber)
            bookSoup = BeautifulSoup(_fetch_html(childurl), "html.parser")

            li = bookSoup.find(name="ul", attrs={"class": "key clearfix"})
            tmp = findISBN(str(li))
            ISBN.append(tmp)

            # Category links, separated by "." and skipping the "图书" entry;
            # the trailing separator is dropped when the list is stored.
            tmpClass = []
            for a_ in bookSoup.findAll(name="a", attrs={"class": "green"}):
                if a_.string != "图书":
                    tmpClass.append(a_.string)
                    tmpClass.append(".")
            classies.append(tmpClass[:-1])

            os.system('cls')
            print("关键词:", keyword)
            print("正在获取 : ", icount, "/ ", ranges * len(title))
            print("获取链接: " + childurl)
            print("ID: " + idNumber)
            print("书名: " + k['title'])
            print("国际索书号ISBN: " + tmp)
            print("分类: " + ("".join(tmpClass[:-1])).replace('.', '>'))

        allMessage = list(zip(title, author, pre_price, now_price, publisher,
                              ISBN, classies, idNumbers))

        for iicount, item in enumerate(allMessage, start=1):
            (titleTag, authorName, prePrice, nowPrice, publisherTag,
             isbn, categories, idNumber) = item
            # Only books whose extracted ISBN is purely numeric are stored.
            if isbn.isdigit():
                sheet.write(count, 0, idNumber)
                sheet.write(count, 1, titleTag['title'])
                sheet.write(count, 2, isbn)
                sheet.write(count, 3, authorName)
                sheet.write(count, 4, prePrice.string)
                sheet.write(count, 5, nowPrice.string)
                sheet.write(count, 6, publisherTag.string)
                sheet.write(count, 7, categories)
                count += 1
            os.system('cls')
            print("关键词:", keyword)
            print("正在写入缓存 : ", iicount, " / ", len(allMessage))

        # Persist after each result page, then wait before the next request.
        excel.save(filename)
        time.sleep(1)
def turn2XLSX(filename):
    """Convert the .xls workbook *filename* to .xlsx through Excel automation.

    The converted file is saved next to the source under the same name with
    an extra "x" appended (``book.xls`` -> ``book.xlsx``).

    Args:
        filename: Path of the .xls file; a relative path is resolved against
            the current working directory, which is where the rest of this
            script creates and looks for the workbook.
    """
    # Hand Excel an absolute path instead of assuming one fixed directory.
    fname = os.path.abspath(filename)
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    try:
        wb = excel.Workbooks.Open(fname)
        try:
            # 51 is Excel's xlOpenXMLWorkbook file format (.xlsx).
            wb.SaveAs(fname + "x", FileFormat=51)
        finally:
            wb.Close()
    finally:
        # Always shut the automation instance down, even if the conversion
        # failed, so no Excel process is left behind.
        excel.Application.Quit()
def start(keyword, ranges=1, flag=False):
    """Scrape every keyword, then convert the collected workbook to .xlsx.

    Args:
        keyword: Iterable of search terms; a single string is treated as one
            term rather than being iterated character by character.
        ranges: Number of result pages fetched per term (passed to ``kore``).
        flag: Passed to ``kore``. When falsy, each term gets its own sheet
            and the placeholder " " sheet is deleted from the converted file.
    """
    if isinstance(keyword, str):
        keyword = [keyword]
    for k in keyword:
        kore(k, ranges, flag)

    os.system('cls')
    print("写入完成,正在转换文件格式")

    # Remove a converted file left over from an earlier run before Excel
    # saves a new one under the same name.
    turnFileName = filename + 'x'
    if os.path.exists(turnFileName):
        os.remove(turnFileName)
    turn2XLSX(filename)

    # Same truthiness test kore() applies to flag.
    if not flag:
        deleteSheet(turnFileName, " ")
def deleteSheet(sExcelFile, sheet):
    """Delete the empty sheet that was created at start-up.

    Loads the workbook at *sExcelFile*, removes the worksheet named *sheet*,
    saves the workbook back to the same path and prints a completion
    message.
    """
    book = openpyxl.load_workbook(sExcelFile)
    book.remove(book[sheet])
    book.save(sExcelFile)
    print("文件转换完成")
def main():
    """Run the scraper for the built-in keyword list and report completion.

    Two result pages are fetched per keyword, with the flag enabled.
    """
    search_terms = [
        r"东野圭吾",
        r"python",
        r"乙一",
        r"心里",
        r"陆秋槎",
        r"信息系统设计",
    ]
    start(search_terms, ranges=2, flag=True)
    os.system('cls')
    print("已完成")


if __name__ == "__main__":
    main()
|