import requests
import re
import pandas as pd

# Scheme + host prefix; getsingle() prepends it to the href values it scrapes.
host = r'https://am.22.cn'
# Prefix of the history detail pages. Nothing in the visible code reads this
# name; the __main__ loop spells out the same literal itself.
url = 'https://am.22.cn/wsp/History/Detail/'

# All patterns use lazy [\s\S] runs so that a match may span line breaks.

# From the literal text "tbody" up to the nearest closing </tbody>.
mainpatt = re.compile(r'tbody[\s\S]+?</tbody>')
# One table row; only an attribute-less <tr> opens a match.
rowpatt = re.compile(r'<tr>[\S\s]+?</tr>')
# Captures the content of a cell. At least one character is required between
# "<td" and ">", so a bare "<td>" does not open a match on its own.
cellpatt = re.compile(r'<td[\s\S]+?>([\s\S]+?)</td>')
# Captures the value of an href="..." attribute.
domainurlpatt = re.compile(r'href="([\s\S]+?)"')
# Captures the link text that follows `_blank">` up to the closing </a>.
domainpatt = re.compile(r'_blank">([\s\S]+?)</a>')
titlepatt = re.compile(r'<title>([\s\S]+?)</title>')def getinner(url):bb = requests.get(url).content.decode('utf8')c = titlepatt.findall(bb)[0]return cdef getsingle(url):aa = requests.get(url=url).content.decode('utf8')main = mainpatt.findall(aa)[0]rows = rowpatt.findall(main)aa = []for each in rows:cells = cellpatt.findall(each)domainstr = cells[1]price = cells[3].strip().replace('<td>', '').replace('¥', '')status = cells[4].strip().replace('</font>', '')domainurl = domainurlpatt.findall(domainstr)[0].strip()domain = domainpatt.findall(domainstr)[0].strip()if domain.find("打包") >=0:tmpurl = host + domainurldomain = getinner(tmpurl)d = {'domain': domain, 'price': price, 'status': status}aa.append(d)return aaif __name__ == '__main__':r = []for i in range(1000, 2232):turl = r'https://am.22.cn/wsp/History/Detail/' + str(i)a = getsingle(turl)r = r + aprint(i, 'done')df = pd.DataFrame(r)df.to_excel('1.xlsx', index=False)
