
Scraping Taobao Pet Listings with Selenium

First, load the Taobao homepage, type in a search query, and pull up the pet product listings.

Start by initializing the browser object:

browser = webdriver.Chrome("D:/Program Files (x86)/Google/Chrome/Application/chromedriver")

My chromedriver sits at D:/Program Files (x86)/Google/Chrome/Application/chromedriver, so I pass that path explicitly; if you leave it out and chromedriver is not on your PATH, Selenium can fail to launch Chrome.
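As an aside, passing the driver path positionally is the old Selenium 3 calling convention; on Selenium 4 the path goes through a Service object instead. A minimal sketch, assuming the same local driver path as above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the chromedriver path in a Service object
service = Service("D:/Program Files (x86)/Google/Chrome/Application/chromedriver")
browser = webdriver.Chrome(service=service)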

def index_page():
    """
    Open the Taobao homepage, search for KeyWord, and return
    the text of the result counter (e.g. "共 100 页").
    """
    try:
        browser.get(url)
        # wait for the search box (#q), then type the keyword and submit
        search_input = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        search_input.send_keys(KeyWord)
        search_input.send_keys(Keys.ENTER)
        # wait for the element that reports the total number of result pages
        total = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
        )
        return total.text
    except TimeoutException:
        # if loading times out, retry from scratch
        return index_page()
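index_page() returns the raw text of the result counter, something like "共 100 页" ("100 pages in total"); main() later pulls the number out with a regular expression. A standalone sketch of that step (the sample string is just an illustration of what total.text looks like):

import re

total_text = '共 100 页'                      # illustrative value of total.text
page_count = int(re.search(r'(\d+)', total_text).group(1))
print(page_count)                             # 100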

Once we know the total number of pages, the next step is page turning. The next_page function below jumps to a given page number through the pager's input box, waits until the highlighted page number confirms the jump, and then scrapes that page:

def next_page(page):
    """Jump to the given results page and scrape it."""
    try:
        # the pager's "go to page" input box and its confirm button
        page_input = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        page_input.clear()
        page_input.send_keys(page)
        submit.click()
        # wait until the highlighted page number matches the one we asked for
        WebDriverWait(browser, 10).until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
        get_product()
    except TimeoutException:
        # on timeout, retry the same page
        return next_page(page)

With page turning in place, we can extract the product information from each results page:

def get_product():
    """Parse the current results page with pyquery and save every product."""
    # wait until the item list has rendered
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist > div .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('strong').text(),
            'deal-cnt': item.find('.deal-cnt').text()[:-2],  # strip the trailing "人付款"
            'location': item.find('.location').text(),
            'J_ClickStat': item.find('.J_ClickStat').text(),
            'shop': item.find('.shop span').siblings('span').text()
        }
        print(product)
        save_mondb(product)
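If you have not used pyquery before, here is a minimal self-contained sketch of the pattern used above: .items() yields each matched node, and .find()/.attr()/.text() read values from it. The HTML below is a made-up stand-in for Taobao's markup:

from pyquery import PyQuery as pq

html = '''
<div class="items">
  <div class="item"><img class="img" src="a.jpg"><strong>12.5</strong></div>
  <div class="item"><img class="img" src="b.jpg"><strong>30.0</strong></div>
</div>
'''
doc = pq(html)
for item in doc('.items .item').items():      # iterate over matched nodes
    print(item.find('.img').attr('src'), item.find('strong').text())
# a.jpg 12.5
# b.jpg 30.0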

Finally, each record is saved to MongoDB:

def save_mondb(result):
    """Insert one product record into MongoDB."""
    try:
        # insert_one is the current pymongo API (insert() is deprecated)
        db[MONGO_COLLECTION].insert_one(result)
        print('saved')
    except Exception:
        print('save failed')
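To verify that records actually landed in the collection, you can read a few back; a minimal sketch, assuming the same MONGO_URL, MONGO_DB, and MONGO_COLLECTION settings used below:

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['taobao1']['product']
print(collection.count_documents({}))         # how many products were saved
for doc in collection.find().limit(3):        # peek at the first few records
    print(doc)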

The complete code:

from selenium import webdriver
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import pymongo

MONGO_URL = 'localhost'
MONGO_DB = 'taobao1'
MONGO_COLLECTION = 'product'

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# path to the local chromedriver; adjust to your own install
browser = webdriver.Chrome("D:/Program Files (x86)/Google/Chrome/Application/chromedriver")

KeyWord = '考研书籍全套2019'
url = "https://www.taobao.com/"

def index_page():
    """
    Open the Taobao homepage, search for KeyWord, and return
    the text of the result counter (e.g. "共 100 页").
    """
    try:
        browser.get(url)
        # wait for the search box (#q), then type the keyword and submit
        search_input = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        search_input.send_keys(KeyWord)
        search_input.send_keys(Keys.ENTER)
        # wait for the element that reports the total number of result pages
        total = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
        )
        return total.text
    except TimeoutException:
        # if loading times out, retry from scratch
        return index_page()

def next_page(page):
    """Jump to the given results page and scrape it."""
    try:
        # the pager's "go to page" input box and its confirm button
        page_input = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        page_input.clear()
        page_input.send_keys(page)
        submit.click()
        # wait until the highlighted page number matches the one we asked for
        WebDriverWait(browser, 10).until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
        get_product()
    except TimeoutException:
        # on timeout, retry the same page
        return next_page(page)

def get_product():
    """Parse the current results page with pyquery and save every product."""
    # wait until the item list has rendered
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist > div .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('strong').text(),
            'deal-cnt': item.find('.deal-cnt').text()[:-2],  # strip the trailing "人付款"
            'location': item.find('.location').text(),
            'J_ClickStat': item.find('.J_ClickStat').text(),
            'shop': item.find('.shop span').siblings('span').text()
        }
        print(product)
        save_mondb(product)

def save_mondb(result):
    """Insert one product record into MongoDB."""
    try:
        # insert_one is the current pymongo API (insert() is deprecated)
        db[MONGO_COLLECTION].insert_one(result)
        print('saved')
    except Exception:
        print('save failed')

def main():
    total = index_page()
    # pull the page count out of text like "共 100 页"
    page_count = int(re.compile(r'(\d+)').search(total).group(1))
    print(page_count)
    get_product()  # the first results page is already loaded
    for i in range(2, page_count + 1):
        next_page(i)

if __name__ == '__main__':
    main()
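One practical tweak: the scraper does not need a visible browser window. A minimal sketch of launching Chrome headless instead (ChromeOptions is standard Selenium; the driver path is my local one):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')            # run Chrome without a window
browser = webdriver.Chrome("D:/Program Files (x86)/Google/Chrome/Application/chromedriver",
                           options=options)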


