kong36088 / baiduimagespider Goto Github PK
View Code? Open in Web Editor NEW一个超级轻量的百度图片爬虫
License: MIT License
一个超级轻量的百度图片爬虫
License: MIT License
错误内容:UnboundLocalError: local variable 'page' referenced before assignment
<urlopen error unknown url type: socks5>
-----urlErrorurl: http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=%E5%9C%9F%E5%A3%A4&cg=girl&pn=0&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm=1e0000001e
Traceback (most recent call last):
File "index.py", line 135, in <module>
crawler.start('土壤', 1, 1) # 抓取关键词为 “二次元 美女”,总数为 10 页(即总共 10*60=600 张),起始抓取的页码为 1
File "index.py", line 128, in start
self.get_images(word)
File "index.py", line 114, in get_images
page.close()
UnboundLocalError: local variable 'page' referenced before assignment
看起来失效了,用不起来
Hello
請問如果是要一般網站的下載圖片
要如何修改呢?
Thanks
爬1000多张就不行了
如题 非常感谢
学习了
鼓励下
爬到第120张图片的时候遇到了标题所示的报错,我用的代码是稍微改过的,改成用图片的原名称命名而非递增的序号:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import argparse
import os
import re
import sys
import urllib
import json
import socket
import urllib.request
import urllib.parse
import urllib.error
# Global socket timeout (seconds) so a hung HTTP request fails fast
# instead of blocking the crawl forever.
import time
timeout = 5
socket.setdefaulttimeout(timeout)
class Crawler:
    """Lightweight Baidu image crawler.

    Pages through Baidu's ``acjson`` image-search API and saves every
    result into ``./<word>/``, naming each file after the tail of its
    download URL so re-runs skip already-downloaded images.
    """

    # Default delay between downloads (seconds); the effective value is
    # the instance attribute ``time_sleep`` set in __init__.
    __time_sleep = 0.1
    __amount = 0        # exclusive upper bound of image indices to fetch
    __start_amount = 0  # index of the first image to fetch
    __counter = 0       # count of files already present in the target folder
    # More User-Agent strings: http://tools.jb51.net/table/useragent
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    __per_page = 30

    def __init__(self, t=0.1):
        """:param t: delay in seconds inserted before each request."""
        self.time_sleep = t

    @staticmethod
    def get_suffix(name):
        """Return the file extension of *name* (dot included).

        Falls back to ``'.jpeg'`` when the name has no extension or the
        candidate is longer than 5 characters (likely not a real suffix).

        Bug fix: the original called ``m.group(0)`` unconditionally and
        raised ``AttributeError`` when ``re.search`` found no dot.
        """
        m = re.search(r'\.[^\.]*$', name)
        if m is not None and m.group(0) and len(m.group(0)) <= 5:
            return m.group(0)
        return '.jpeg'

    def save_image(self, rsp_data, word):
        """Download every image listed in ``rsp_data['data']`` into ./word/.

        Existing files and (near-)empty downloads are skipped; per-item
        errors are logged and skipped so one bad URL cannot abort the page.
        """
        if not os.path.exists("./" + word):
            os.mkdir("./" + word)
        # Seed the counter with how many files the folder already holds.
        self.__counter = len(os.listdir('./' + word)) + 1
        # Install a desktop-browser UA for urlretrieve once (reduces 403s);
        # hoisted out of the loop because install_opener is process-global.
        opener = urllib.request.build_opener()
        opener.addheaders = [
            ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'),
        ]
        urllib.request.install_opener(opener)
        for image_info in rsp_data['data']:
            try:
                # Entries without a usable original-image URL are skipped.
                if 'replaceUrl' not in image_info or len(image_info['replaceUrl']) < 1:
                    continue
                obj_url = image_info['replaceUrl'][0]['ObjUrl']
                thumb_url = image_info['thumbURL']
                # Baidu's "down" endpoint proxies the original image for us.
                url = 'https://image.baidu.com/search/down?tn=download&ipn=dwnl&word=download&ie=utf8&fr=result&url=%s&thumburl=%s' % (urllib.parse.quote(obj_url), urllib.parse.quote(thumb_url))
                time.sleep(self.time_sleep)
                # Name the file after the tail of the download URL (stable
                # across runs, so duplicates are detected below).
                image_name = url.split('/')[-1]
                filepath = './%s/%s' % (word, image_name)
                if os.path.exists(filepath):
                    print(f'此图片已存在{filepath},跳过下载')
                    continue
                urllib.request.urlretrieve(url, filepath)  # save to disk
                # Anti-crawler responses come back as tiny/empty bodies.
                if os.path.getsize(filepath) < 5:
                    print("下载到了空文件,跳过!")
                    os.unlink(filepath)
                    continue
                print("文件夹里已有" + str(self.__counter) + "张图片")
                self.__counter += 1
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                # Unknown failure: back off briefly, log, and move on.
                time.sleep(1)
                print(err)
                print("产生未知错误,放弃保存")
                continue
        return

    def get_images(self, word):
        """Fetch result pages for *word* and hand each JSON page to
        :meth:`save_image` until ``__amount`` images have been requested."""
        search = urllib.parse.quote(word)
        # pn is the index of the first image on the current page.
        pn = self.__start_amount
        while pn < self.__amount:
            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%s&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=©right=&word=%s&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=%s&rn=%d&gsm=1e&1594447993172=' % (search, search, str(pn), self.__per_page)
            # Custom header to reduce 403 rejections.
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=self.headers)
                page = urllib.request.urlopen(req)
                rsp = page.read()
                page.close()
            except UnicodeDecodeError as e:
                print(e)
                print('-----UnicodeDecodeErrorurl:', url)
            except urllib.error.URLError as e:
                print(e)
                print("-----urlErrorurl:", url)
            except socket.timeout as e:
                print(e)
                print("-----socket timeout:", url)
            else:
                try:
                    # Bug fix: Baidu sometimes returns a non-JSON
                    # anti-crawler page; skip it instead of crashing the
                    # whole run with JSONDecodeError.
                    rsp_data = json.loads(rsp)
                except json.JSONDecodeError as e:
                    print(e)
                    print("-----JSONDecodeError url:", url)
                else:
                    self.save_image(rsp_data, word)
                    print("下载下一页")
            # Always advance to the next page, so a persistently failing
            # URL cannot spin this loop forever.
            pn += self.__per_page
        print("下载任务结束")
        return

    def start(self, word, total_page=1, start_page=1, per_page=30):
        """
        Crawler entry point.
        :param word: keyword to search for
        :param total_page: number of pages to fetch; total images = total_page * per_page
        :param start_page: 1-based page number to start from
        :param per_page: images requested per page
        :return: None
        """
        self.__per_page = per_page
        self.__start_amount = (start_page - 1) * self.__per_page
        self.__amount = total_page * self.__per_page + self.__start_amount
        self.get_images(word)
if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Command-line mode: -w/-tp/-sp are required; per-page size and
        # delay are optional.
        parser = argparse.ArgumentParser()
        parser.add_argument("-w", "--word", type=str, help="抓取关键词", required=True)
        parser.add_argument("-tp", "--total_page", type=int, help="需要抓取的总页数", required=True)
        parser.add_argument("-sp", "--start_page", type=int, help="起始页数", required=True)
        parser.add_argument("-pp", "--per_page", type=int, help="每页大小", choices=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100], default=30, nargs='?')
        parser.add_argument("-d", "--delay", type=float, help="抓取延时(间隔)", default=0.05)
        args = parser.parse_args()
        crawler = Crawler(args.delay)
        crawler.start(args.word, args.total_page, args.start_page, args.per_page)  # crawl per CLI arguments
    else:
        # No CLI arguments given: run the hard-coded example below.
        crawler = Crawler(0.05)  # 0.05 s delay between downloads
        crawler.start('警车', 10, 1, 30)  # keyword '警车': 10 pages of 30 images starting at page 1 (10*30 = 300 total)
        # crawler.start('二次元 美女', 10, 1)  # example: another keyword
        # crawler.start('帅哥', 5)  # example: fewer pages
usage: ipykernel_launcher.py [-h] -w WORD -tp TOTAL_PAGE -sp START_PAGE
[-pp [{10,20,30,40,50,60,70,80,90,100}]]
[-d DELAY]
ipykernel_launcher.py: error: the following arguments are required: -w/--word, -tp/--total_page, -sp/--start_page
An exception has occurred, use %tb to see the full traceback.
SystemExit: 2
这是为啥?
下载到了空文件,跳过!
下载到了空文件,跳过
下载到了空文件,跳过!
Remote end closed connection without response
产生未知错误,放弃保存
以上是问题,总是下载到了空文件,pc可以,但是termux进行就会这个问题,无root,请问怎么解决
大佬,还能搞一下吗
Hi, great jobs
when i check issue history, it seems you have resolved this issue, but when i try to run it, i also meet this issue.
could you check it. Thanks
作者您好,感谢您提供的脚本,我修改了crawler.start('消防车', 10, 1, 30)里的中间两个参数,似乎怎么修改下载的图片张数总是150张,您能指点一下吗?谢谢!
只能爬两页,第三页就开始触发反爬机制了咋办哥?
还可以用,很棒
My account was compromised, as a result many spam issues got created across multiple repos. I am deleting all such issues. Please check my tweet: https://x.com/arghyac35/status/1729721954909684064?s=20
我直接作者的运行 cmd
不停显示以下报错:
The read operation timed out
产生未知错误,放弃保存
百度图片怎么200张就403错误拒绝访问了
我只是想在百度上爬1w张图片,我有什么错?(doge)
为什么我把从百度图片返回的json数据中得到的图片网址放到浏览器中访问,显示403错误。
Traceback (most recent call last):
File "index.py", line 140, in
crawler.start('树',100,1)
File "index.py", line 131, in start
self.get_images(word)
File "index.py", line 111, in get_images
rsp_data = json.loads(rsp)
File "/home/ch/anaconda3/envs/py3/lib/python3.6/json/init.py", line 354, in loads
return _default_decoder.decode(s)
File "/home/ch/anaconda3/envs/py3/lib/python3.6/json/decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/home/ch/anaconda3/envs/py3/lib/python3.6/json/decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
我想多爬一点图片,为什么会在60张左右重复?
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.