今天开始学爬虫1
1.1 只写 `import urllib` 再调用 `urllib.request` 会报错:module 'urllib' has no attribute 'request'
应该写成 `import urllib.request`(子模块需要显式导入)
import urllib.request

# Fetch the Baidu home page and print its HTML source.
url = 'http://www.baidu.com/'
# urlopen() performs the GET request and returns a response object.
# The response is intentionally left open so its other accessors can be
# tried out afterwards.
response = urllib.request.urlopen(url)
# read() returns raw bytes; decode them as UTF-8 to get text.
content = response.read().decode('utf-8')
print(content)

# 2.1
# Ways to read from / inspect the response object returned by urlopen().
# Each statement is an independent example: they all rebind `content`, and
# the body can only be consumed once per response.

# Return the whole body as bytes.
content = response.read()
# Return one line of the body (bytes).
content = response.readline()
# Return the remaining lines of the body as a list of bytes.
content = response.readlines()
# Return the HTTP status code.
content = response.getcode()
# Return the URL of the resource that was retrieved.
content = response.geturl()
# Return the response headers as a list of (name, value) tuples.
content = response.getheaders()

# 2.2 Download the content behind a link
# Template: download the resource at `url` and save it as the local file
# `filename`. Both values are placeholders and must be filled in before use.
url = ''
filename = ''
urllib.request.urlretrieve(url, filename)

# 2.3 UA: customising the request object
# Send a request that carries a browser User-Agent header.
url = "https://www.dianping.com/"
# The User-Agent value was copied from the browser (DevTools > Network).
# "Accept-Encoding: identity" asks for an uncompressed body so that the
# bytes can be decoded directly.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0',
    "Accept-Encoding": "identity",
}
# Wrap the URL and the headers into a single Request object.
request = urllib.request.Request(url=url, headers=header)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# 2.4 urllib.parse.quote: percent-encode non-ASCII text for a URL; GET request objects
import urllib.parse

# Single parameter: percent-encode the search term and append it to the URL.
url1 = 'https://www.baidu.com/s?wd='
name = urllib.parse.quote('搜索的内容')
url = url1 + name

# Several parameters: urlencode() percent-encodes every key/value pair and
# joins the pairs with '&'.
base_url = "https://www.baidu.com/s?"
data = {
    'wd': '主花',
    'fandom': 'Persona4',
}
new_data = urllib.parse.urlencode(data)
url = base_url + new_data

# 2.5 POST request object (the form data is shown under DevTools > Network >
# Payload); requests to Baidu and Baidu Translate were blocked
# POST the word to look up to Baidu Translate's suggestion endpoint.
base_url = "https://fanyi.baidu.com/sug"
data = {
    'kw': 'formular',
}
# A POST body must be bytes: urlencode the form fields, then encode as UTF-8.
new_data = urllib.parse.urlencode(data).encode('utf-8')
# The User-Agent value was copied from the browser (DevTools > Network).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0',
    "Accept-Encoding": "identity",
}
# Wrap the URL, the headers and the form data (which does not appear in the
# URL) into one Request object. The request must target `base_url`; passing
# a leftover `url` from an earlier GET example would post to the wrong page.
request = urllib.request.Request(url=base_url, data=new_data, headers=header)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')
print(content)

# 3.1 Parse text into a dict (with a regex) so it is easy to add to headers
import re

# Regex version: capture every "key: value" pair of a comma-separated string.
text = "name: 张三, age: 18, city: 北京"
result = re.findall(r'\s*([^:,]+)\s*:\s*([^,]+)\s*', text)
# findall() yields (key, value) tuples, which dict() accepts directly.
data = dict(result)
print(data)

# Same result without a regex.
text = "name:张三,age:18,city:北京"
data = {}
for item in text.split(","):
    # Split on the first ':' only, so a value that itself contains ':'
    # (for example a URL) is kept whole instead of raising ValueError.
    key, value = item.split(":", 1)
    data[key] = value
print(data)

# 3.2 Request headers copied as one line of key followed by one line of value
加上 Cookie 之后就能请求成功了。
不过 GPT 给出的下面这组请求头不带 Cookie,同样请求成功:
# Browser-like request headers for fanyi.baidu.com; no Cookie entry is included.
headers = {
    # Long User-Agent string split across adjacent literals for readability.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/147.0.0.0 Safari/537.36 Edg/147.0.0.0"
    ),
    "Referer": "https://fanyi.baidu.com/",
    "Origin": "https://fanyi.baidu.com",
    # Content type of a urlencoded form body.
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "*/*",
    # Ask for an uncompressed response body.
    "Accept-Encoding": "identity",
}
# Headers copied from the browser arrive as alternating lines:
# a header name on one line, its value on the next.
raw_headers = """
accept
*/*
accept-encoding
gzip, deflate, br, zstd
accept-language
zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6
connection
keep-alive
"""

# Drop blank lines and surrounding whitespace.
lines = [line.strip() for line in raw_headers.splitlines() if line.strip()]

headers = {}
# Pair every even-indexed line (name) with the following odd-indexed line
# (value). zip() stops at the shorter slice, so a dangling name without a
# value is skipped instead of raising IndexError.
for key, value in zip(lines[::2], lines[1::2]):
    headers[key] = value
print(headers)