# Scraper for the spa1.scrape.center movie API: collects movie records and saves them to CSV or MongoDB.
import csv
from datetime import datetime

import pymongo
import requests
class spa1:
    """Scraper for the paginated movie API at spa1.scrape.center.

    :meth:`url_update` downloads the listing pages and accumulates one
    dictionary per movie in ``self.all_movies``.  The collected records can
    then be written out with :meth:`save_info_cvs` (CSV file) or
    :meth:`save_info_mongo` (MongoDB collection).
    """

    # Must match the ``limit`` query parameter baked into ``self.url``.
    PAGE_SIZE = 10
    # Upper bound on the number of listing pages requested in one run.
    MAX_PAGES = 100
    # Extra attempts made after the first request for a page fails.
    MAX_RETRIES = 3
    # Per-request timeout in seconds.
    REQUEST_TIMEOUT = 5

    # (stored column name, key in the API payload), in CSV column order.
    _FIELDS = (
        ('ID', 'id'),
        ('NAME', 'name'),
        ('ALIAS', 'alias'),
        ('COVER', 'cover'),
        ('CATEGORIES', 'categories'),
        ('PUBLISHED_AT', 'published_at'),
        ('MINUTE', 'minute'),
        ('SCORE', 'score'),
        ('REGIONS', 'regions'),
    )

    def __init__(self):
        """Set up the URL template, request headers, output file and MongoDB handle."""
        self.url = 'https://spa1.scrape.center/api/movie/?limit=10&offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
        }
        self.all_movies = []
        self.filename = 'moves.csv'
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client['spa1']['movies']

    def _page_url(self, page):
        """Return the request URL for the zero-based listing page ``page``.

        The API's ``offset`` counts records, not pages, so the page index is
        scaled by the page size; otherwise consecutive requests would overlap
        and yield duplicate records.
        """
        return self.url.format(page * self.PAGE_SIZE)

    def _fetch_results(self, url):
        """Fetch ``url`` and return its ``results`` list.

        The request is attempted once plus up to ``MAX_RETRIES`` more times.
        Every failure is reported on stdout.

        Returns:
            The (possibly empty) list of results, or ``None`` when every
            attempt failed.
        """
        for _ in range(self.MAX_RETRIES + 1):
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                # Normalise a missing/empty value to [] so that ``None``
                # unambiguously means "request failed".
                return response.json()['results'] or []
            except Exception as e:  # best-effort crawl: report and retry
                print(f"请求失败:{url}, 错误信息:{e}")
        return None

    def url_update(self):
        """Download listing pages and collect their movies.

        Pages are requested in order until an empty page is returned or
        ``MAX_PAGES`` pages have been requested.  A page whose request keeps
        failing is skipped and the crawl continues with the next one.
        """
        for page in range(self.MAX_PAGES):
            url = self._page_url(page)
            print(f"现在正在获取此:{url}")
            results = self._fetch_results(url)
            if results is None:
                continue
            if not results:
                print(f"第{page}页已经没有数据,停止爬取")
                break
            self.moves_data(results)

    def moves_data(self, json_info):
        """Convert raw API entries to records and append them to ``self.all_movies``.

        Args:
            json_info: Expected to be a list of dictionaries as returned in
                the ``results`` field of the API.  Anything that is not a
                list is rejected with a message; list entries that are not
                dictionaries are skipped.
        """
        if not isinstance(json_info, list):
            print(f"获取的数据不是列表类型,请检查数据类型")
            return
        for move in json_info:
            if not isinstance(move, dict):
                continue
            self.all_movies.append(
                {column: move.get(key) for column, key in self._FIELDS}
            )
        print(f"已收集电影数据:{len(self.all_movies)}")

    def save_info_cvs(self):
        """Write the collected movies to ``self.filename`` as UTF-8 CSV.

        The file is overwritten.  Nothing is written when no movies have been
        collected.  Errors while writing are reported on stdout rather than
        raised.
        """
        if not self.all_movies:
            print("没有电影数据")
            return
        print('列表中存在数据,可以开始保存了!!!')

        headers = [column for column, _ in self._FIELDS]
        try:
            with open(self.filename, 'w', encoding='utf-8', newline='') as f:
                # Ignore keys outside the known columns (e.g. an ``_id`` added
                # by a database driver) instead of aborting the export.
                writer = csv.DictWriter(
                    f, fieldnames=headers, extrasaction='ignore'
                )
                writer.writeheader()
                for move_data in self.all_movies:
                    writer.writerow(move_data)
                    print(f"已保存电影:{move_data['NAME']}")
        except Exception as e:
            print(f"保存文件时错误,错误信息:{e}")

    def save_info_mongo(self):
        """Insert the collected movies into the MongoDB collection ``self.db``.

        After inserting, an ascending index on ``ID`` is created.  Nothing is
        done when no movies have been collected.  Errors are reported on
        stdout rather than raised.
        """
        if not self.all_movies:
            print('无电影数据可以保存!!!')
            return
        print('列表中存在数据,可以开始保存了!!!')

        try:
            # insert_many adds an ``_id`` key to each document it receives;
            # insert shallow copies so the in-memory records stay unchanged.
            documents = [dict(movie) for movie in self.all_movies]
            result = self.db.insert_many(documents)
            print(f"已保存{len(result.inserted_ids)}条数据")

            self.db.create_index([('ID', pymongo.ASCENDING)])
            print("创建索引成功")
        except Exception as e:
            print(f"保存数据到MongoDB时发生错误,错误信息:{e}")
def main():
    """Run the crawl and store the collected movies in MongoDB."""
    crawler = spa1()
    crawler.url_update()
    crawler.save_info_mongo()


if __name__ == "__main__":
    main()