'''01_普通代理示例.py'''
import requests

url = "http://www.baidu.com/"
proxies = {"http": "http://183.129.207.82:11597"}
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, proxies=proxies, headers=headers)
print(res.status_code)
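Public proxies like the one above go stale quickly, so in practice the request is usually wrapped in error handling. A minimal sketch, assuming the same url/proxies/headers as above; the timeout value and fallback messages are illustrative, not from the original:

import requests

url = "http://www.baidu.com/"
proxies = {"http": "http://183.129.207.82:11597"}  # public proxy; may be dead
headers = {"User-Agent": "Mozilla/5.0"}

try:
    # A short timeout keeps a dead proxy from hanging the whole script
    res = requests.get(url, proxies=proxies, headers=headers, timeout=5)
    print(res.status_code)
except requests.exceptions.ProxyError:
    print("Proxy refused or unreachable, try another one")
except requests.exceptions.Timeout:
    print("Request timed out")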
'''02_私密代理示例.py'''
import requests

url = "http://httpbin.org/get"
headers = {"User-Agent": "Mozilla/5.0"}
# Private proxy: credentials go in the URL as user:password@host:port
proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}

res = requests.get(url, proxies=proxies, headers=headers)
res.encoding = "utf-8"
print(res.text)
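The same credential-in-URL form covers HTTPS targets too; requests picks the proxy entry by the request scheme, so a proxies dict usually carries both keys. A minimal sketch, reusing the (likely long-expired) proxy account from the example:

import requests

proxies = {
    "http":  "http://309435365:szayclhp@123.206.119.108:16817",
    "https": "http://309435365:szayclhp@123.206.119.108:16817",
}
res = requests.get("https://httpbin.org/get",
                   proxies=proxies,
                   headers={"User-Agent": "Mozilla/5.0"},
                   timeout=5)
print(res.json()["origin"])  # httpbin echoes the requesting IP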
'''05_链家数据ToMongo.py'''
# NOTE: despite the filename, this version writes to MySQL via pymysql;
# a MongoDB variant is sketched after the code.
import requests
import re
import pymysql
import warnings

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.proxies = {"http": "http://127.0.0.1:8888"}
        # Modern pymysql requires keyword arguments here
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="ParisPython", charset="utf8")
        self.cursor = self.db.cursor()

    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies,
                           headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    def parsePage(self, html):
        # NOTE: the original pattern's HTML tags were stripped when this
        # article was extracted; the pattern below is a plausible
        # reconstruction for Lianjia's listing markup, not the original.
        p = re.compile(
            '<div class="houseInfo">.*?data-el="region">(.*?)</a>'
            '.*?<div class="totalPrice"><span>(.*?)</span>(.*?)</div>',
            re.S)
        r_list = p.findall(html)
        # [("天通苑", "480", "万"), (), ...]
        print("Parsing done, saving to database...")
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_db = "create database if not exists Lianjiadb \
                character set utf8"
        u_db = "use Lianjiadb"
        c_tab = "create table if not exists housePrice( \
                 id int primary key auto_increment, \
                 housename varchar(50), \
                 totalprice int) charset=utf8"
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except Warning:
            pass

        ins = "insert into housePrice(housename,totalprice) \
               values(%s,%s)"
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            price = float(r_tuple[1].strip()) * 10000
            L = [name, price]
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to crawl (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Crawl finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()
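Since the filename says ToMongo while the class above targets MySQL, here is a minimal sketch of what the MongoDB counterpart of writeTomysql could look like, assuming a local mongod and pymongo; the database and collection names are illustrative, not from the original:

import pymongo

conn = pymongo.MongoClient("localhost", 27017)
myset = conn.Lianjiadb.housePrice  # hypothetical db/collection names

def writeToMongo(r_list):
    # r_list items look like ("天通苑", "480", "万")
    for name, price, _unit in r_list:
        doc = {"housename": name.strip(),
               "totalprice": float(price.strip()) * 10000}
        myset.insert_one(doc)
    print("Saved to MongoDB")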
'''09_Web客户端验证.py'''
import requests
import re

class NoteSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = "URL here"
        self.proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
        # The auth parameter holds the username and password (must be a tuple)
        self.auth = ("username", "password")

    def getParsePage(self):
        res = requests.get(self.url,
                           proxies=self.proxies,
                           headers=self.headers,
                           auth=self.auth,
                           timeout=3)
        res.encoding = "utf-8"
        html = res.text
        print(html)
        # NOTE: the original file is cut off here, mid-regex; the pattern
        # and the rest of the class did not survive extraction.
        p = re.compile('...', re.S)
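The tuple form of auth is shorthand for HTTP Basic authentication. An equivalent, more explicit sketch using requests' own HTTPBasicAuth class; httpbin's /basic-auth endpoint stands in for the target, since the original URL is a placeholder:

import requests
from requests.auth import HTTPBasicAuth

# httpbin's /basic-auth/<user>/<passwd> endpoint accepts exactly these credentials
res = requests.get("http://httpbin.org/basic-auth/user/passwd",
                   auth=HTTPBasicAuth("user", "passwd"),
                   timeout=3)
print(res.status_code)  # 200 on success, 401 on bad credentials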
'''10_SSL证书认证示例.py'''
import requests

url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla/5.0"}

# verify=False skips SSL certificate verification (12306 historically
# used a self-signed certificate that fails the default check)
res = requests.get(url, headers=headers, verify=False)
res.encoding = "utf-8"
print(res.text)
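With verify=False, requests still prints an InsecureRequestWarning on every call. A minimal sketch of the usual way to silence it via urllib3 (bundled with requests):

import requests
import urllib3

# Suppress the InsecureRequestWarning emitted for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

res = requests.get("https://www.12306.cn/mormhweb/",
                   headers={"User-Agent": "Mozilla/5.0"},
                   verify=False)
print(res.status_code)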
import urllib.request  # (import missing in the original snippet)

url = "http://www.baidu.com/"  # example URL; not shown in the original

# Create a Handler object
http_handler = urllib.request.HTTPHandler()
# proxy_handler = urllib.request.ProxyHandler()

# Build a custom opener from the handler
opener = urllib.request.build_opener(http_handler)

# Send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
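If every request in a script should go through the same opener, urllib can install it globally, after which plain urlopen() uses it. A minimal sketch, assuming the HTTPHandler-based opener from above:

import urllib.request

opener = urllib.request.build_opener(urllib.request.HTTPHandler())
# install_opener makes this opener the default for urlopen()
urllib.request.install_opener(opener)

res = urllib.request.urlopen("http://www.baidu.com/")
print(res.getcode())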
import urllib.request  # (import missing in the original snippet)

url = "http://www.baidu.com/"       # example URL; not shown in the original
proxy = {"http": "http://IP:PORT"}  # placeholder; fill in a live proxy

# Create a ProxyHandler object
pro_hand = urllib.request.ProxyHandler(proxy)
# Build a custom opener
opener = urllib.request.build_opener(pro_hand)
# Send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
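ProxyHandler accepts private (authenticated) proxies in the same user:password@host:port form used with requests earlier. A sketch reusing the credentialed proxy from the requests examples:

import urllib.request

# Credentials embedded in the proxy URL, same account as the requests examples
proxy = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))

req = urllib.request.Request("http://httpbin.org/get",
                             headers={"User-Agent": "Mozilla/5.0"})
res = opener.open(req, timeout=5)
print(res.read().decode("utf-8"))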
'''06_猫眼电影top100抓取.py'''
import requests
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Film
        self.myset = self.db.top100

    # Download a page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # Parse a page
    def parsePage(self, html):
        # NOTE: the original pattern's HTML tags were stripped when this
        # article was extracted; the pattern below is a plausible
        # reconstruction for Maoyan's board markup, not the original.
        p = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)"'
            '.*?class="star">(.*?)</p>'
            '.*?class="releasetime">(.*?)</p>',
            re.S)
        r_list = p.findall(html)
        # print(r_list)  # [("霸王别姬", "张国荣", "1994-01-01"), (), ...]
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            star = r_tuple[1].strip()
            releasetime = r_tuple[2].strip()
            D = {"name": name,
                 "star": star,
                 "releasetime": releasetime}
            # insert() is removed in modern pymongo; insert_one() is the
            # current equivalent
            self.myset.insert_one(D)
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to crawl (y/n): ")
            if c.strip().lower() == "y":
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Crawl finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
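To check what landed in MongoDB, a quick read-back sketch, assuming the same Film.top100 collection and a local mongod:

import pymongo

myset = pymongo.MongoClient("localhost", 27017).Film.top100

print("documents stored:", myset.count_documents({}))
for doc in myset.find({}, {"_id": 0}).limit(3):
    print(doc)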
Reposted from: http://bhdja.baihongyu.com/