The following 30 code examples, extracted from open-source Python projects, illustrate how to use scrapy.http.FormRequest().
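Before the project examples, here is a minimal, hypothetical sketch of the API itself (the URL, field names, spider name, and callbacks are placeholder assumptions, not taken from any project below): FormRequest behaves like a regular Request, except that it URL-encodes the formdata dict into the request body and defaults to the POST method when formdata is supplied.

# A minimal, hypothetical sketch; the URL, field names, and callbacks are
# placeholders rather than values from the examples below.
import scrapy
from scrapy.http import FormRequest


class FormRequestDemoSpider(scrapy.Spider):
    name = 'formrequest_demo'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # formdata is URL-encoded into the request body; the method defaults
        # to POST when formdata is supplied.
        yield FormRequest(
            'https://example.com/login',
            formdata={'username': 'demo', 'password': 'secret'},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('Login response status: %s', response.status)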
def parse_data(self, response):
    item = AfscrapyItem()
    datas = json.loads(response.body.decode('utf-8'))
    for data in datas['msg']:
        item['goods_id'] = data['id']
        item['shop_name'] = "??"
        item['category_name'] = response.meta["cat"]
        item['title'] = data['product_name']
        item['sales_num'] = 0
        item['unit'] = data['volume']
        item['price'] = data['price']
        item['location'] = ""
        yield item
    next_page = int(response.meta['page']) + 1
    yield FormRequest(response.url,
                      formdata={"class_id": response.meta["class_id"], "curr_page": str(next_page)},
                      callback=self.parse_data,
                      meta={"cat": response.meta["cat"], "class_id": response.meta["class_id"], "page": next_page})
def login_verify(self, response):
    if response.url == self.login_verify_url:
        self.is_login = True
        self.login_time = time.mktime(time.strptime(
            response.headers['Date'],
            '%a, %d %b %Y %H:%M:%S %Z')) + (8 * 60 * 60)
        time.sleep(1)
        return [FormRequest(self.submit_url,
                            formdata={
                                'problem_id': self.problem_id,
                                'language': LANGUAGE.get(self.language, '0'),
                                'source': self.source,
                                'submit': 'Submit',
                                'encoded': '1'
                            },
                            callback=self.after_submit,
                            dont_filter=True)]
    else:
        return Request(self.start_urls[0], callback=self.parse_start_url)
def post_get_playlist(self, response):
    collection = self.db.playlist
    result = json.loads(response.body, encoding='utf-8')['result']
    # inserted = collection.update({'id': result['id']}, result, upsert=True)  # upsert=True: insert or update
    # logger.info('Update or Insert to playlist database[%s]' % (str(inserted),))
    if result['id'] not in self.playlist_id_buffer:
        collection.insert(result)

    for song in result['tracks']:
        artists = []
        for detail in song['artists']:
            artists.append(detail['name'])
        comment_url = 'http://music.163.com/weapi/v1/resource/comments/%s/?csrf_token=' % (song['commentThreadId'],)
        # Send the POST request with FormRequest; an alternative would be:
        # Request(url, method='POST', body=json.dumps(data))
        yield FormRequest(comment_url, formdata=self.post_data, callback=self.parse,
                          meta={'m_id': song['id'], 'm_name': song['name'], 'artists': artists})
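The comment in the example above contrasts FormRequest (which URL-encodes its formdata) with a plain Request carrying a JSON body. A minimal, hypothetical sketch of that alternative; the url, data, and callback parameters are placeholders:

# Hypothetical sketch: POSTing a JSON body with a plain Request instead of
# FormRequest. The url, data, and callback arguments are placeholders.
import json
from scrapy import Request


def make_json_post(url, data, callback):
    # The body is serialized manually and the Content-Type set explicitly,
    # since Request does not encode payloads on its own.
    return Request(
        url,
        method='POST',
        body=json.dumps(data),
        headers={'Content-Type': 'application/json'},
        callback=callback,
    )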
def parse(self, response):
    EVENTVALIDATION = response.xpath("//*[@id='__EVENTVALIDATION']/@value").extract()
    VIEWSTATE = response.xpath("//*[@id='__VIEWSTATE']/@value").extract()
    for i in range(1, 5):
        yield FormRequest(
            'http://environmentclearance.nic.in/Search.aspx',
            headers={'user-agent': 'Mozilla/5.0'},
            formdata={
                'ww': 'rr|GridView1',
                '__LASTFOCUS': '',
                '__EVENTTARGET': 'GridView1',
                '__EVENTARGUMENT': 'Page${}'.format(i),
                '__VIEWSTATE': VIEWSTATE,
                '__EVENTVALIDATION': EVENTVALIDATION,
                'a': 'rb1',
                'dd1status': 'UPEChome',
                'ddlyear': '-All Years-',
                'ddlcategory': '-All Category-',
                'ddlstate': '-All State-',
                'textbox2': '',
                'DropDownList1': 'UPEC'
            },
            callback=self.parse_item
        )
def parse(self, response):
    all_urls = response.xpath('//div[@class="tit_sort"]//dl')
    if len(all_urls):
        for url in all_urls:
            category_name = url.xpath('./dt/a/text()').extract()[0]
            next_urls = url.xpath('.//em//a/@href').extract()
            for next_url in next_urls:
                class_id = re.search(r"list-(\d+)-(\d+)-(\d+)", next_url)
                c1 = class_id.group(1)
                c2 = class_id.group(2)
                c3 = class_id.group(3)
                next_url = "http://www.benlai.com/NewCategory/GetLuceneProduct"
                yield FormRequest(next_url,
                                  formdata={"c1": c1, "c2": c2, "c3": c3, "page": "1"},
                                  callback=self.parse_data,
                                  meta={"cat": category_name, "c1": c1, "c2": c2, "c3": c3, "page": "1"})
def parse_data(self, response):
    item = AfscrapyItem()
    datas = json.loads(response.body.decode('utf-8'))
    for data in datas['ProductList']:
        item['goods_id'] = data['ProductSysNo']
        item['shop_name'] = "??"
        item['category_name'] = response.meta["cat"]
        item['title'] = data['ProductName']
        item['sales_num'] = 0
        item['unit'] = ""
        item['price'] = data['ProductNowPrice']
        item['location'] = ""
        yield item
    if len(datas['ProductList']):
        next_page = int(response.meta["page"]) + 1
        yield FormRequest(response.url,
                          formdata={"c1": response.meta['c1'], "c2": response.meta['c2'],
                                    "c3": response.meta['c3'], "page": str(next_page)},
                          callback=self.parse_data,
                          meta={"cat": response.meta["cat"], "c1": response.meta['c1'],
                                "c2": response.meta['c2'], "c3": response.meta['c3'],
                                "page": str(next_page)})
def start_login(self, response):
    xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
    return [FormRequest('https://www.zhihu.com/login/phone_num',
                        method='POST',
                        meta={'cookiejar': response.meta['cookiejar']},
                        formdata={
                            # '_xsrf': xsrf,
                            'password': 'feifengwind',
                            'remember_me': "true",
                            'phone_num': '18983848805'},
                        callback=self.after_login)]
def start_requests(self):
    return [FormRequest(self.login_url,
                        formdata={
                            'user_id1': self.username,
                            'password1': self.password,
                            'B1': 'login',
                        },
                        callback=self.after_login)]
def parse(self, response): return [FormRequest("https://m.facebook.com/login.php", formdata={ 'email': self.email, 'pass': self.password }, callback=self.parse_post_login) ]
def start_requests(self): url = "http://per.spdb.com.cn/was5/web/search" for i in range(1, 3): formdata = { "page": str(i), "metadata": "finance_state|finance_no|finance_allname|finance_anticipate_rate|finance_limittime|finance_lmttime_info|finance_type|docpuburl|finance_ipo_enddate|finance_indi_ipominamnt|finance_indi_applminamnt", "channelid": "266906", "searchword": "(product_type=3)*finance_limittime = %*(finance_currency = 01)*(finance_state='???')" } yield FormRequest(url, callback=self.parse_model, formdata=formdata)
def start_requests(self):
    self.logger.info('Login')
    self.cookies['m-login'] = '0'
    for one in self.start_urls:
        yield FormRequest(one,
                          cookies=self.cookies,
                          formdata=self.frmdata,
                          callback=self.parse,
                          headers={'Referer': 'https://www.quora.com/'},
                          dont_filter=True)
def start_requests(self):
    for one in self.start_urls:
        yield FormRequest(one,
                          cookies=self.cookies,
                          formdata=self.frmdata,
                          callback=self.parse,
                          headers={
                              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                              'Referer': 'https://www.quora.com/'})
def start_requests(self):
    for i, url in enumerate(self.start_urls):
        yield FormRequest(url,
                          meta={'cookiejar': i},
                          headers=self.headers,
                          cookies=self.cookies,
                          callback=self.parse,
                          dont_filter=True)  # jump to login page
def login(self, response):
    print('post_login')
    # FormRequest.from_response is the helper Scrapy provides for submitting
    # a page's form via POST; here the request is built by hand instead.
    self.headers["X-Requested-With"] = "XMLHttpRequest"
    self.headers["Referer"] = self.index_url
    return [FormRequest(
        url=self.login_url,
        formdata=self.login_formdata,
        headers=self.headers,
        callback=self.check_login_status,
    )]
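Since the comment above refers to FormRequest.from_response, here is a minimal, hypothetical sketch of that helper (the spider name, URL, credentials, and callback are placeholder assumptions): it reads the form found in the given response, keeps its hidden fields, and overrides only the fields passed in formdata.

# Hypothetical sketch of FormRequest.from_response; the spider name, URL,
# credentials, and callback below are placeholders.
import scrapy
from scrapy.http import FormRequest


class FromResponseLoginSpider(scrapy.Spider):
    name = 'from_response_demo'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # from_response() copies hidden fields (e.g. CSRF tokens) from the
        # form in `response` and overrides only the fields given here.
        yield FormRequest.from_response(
            response,
            formdata={'username': 'demo', 'password': 'secret'},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('Login response: %s', response.status)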
def start_requests(self):
    for i, url in enumerate(self.start_urls):
        yield FormRequest(
            url=url,
            headers={
                'Accept': 'application/json',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Host': 'www.assetstore.unity3d.com',
                'Referer': 'https://www.assetstore.unity3d.com/en/',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
                              'Firefox/50.0',
                'X-Kharma-Version': '0',
                'X-Requested-With': 'UnityAssetStore',
                'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
            },
            method='POST',
            formdata={
                'current_package_id': '',
                'hardware_hash': '',
                'language_code': 'en',
                'license_hash': '',
            },
            meta={
                'download_timeout': 20,
                'is_proxy': False,
            },
            callback=self.get_unity_version,
        )
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors
    )
def start_requests(self): return [ FormRequest("http://spys.ru/en/free-proxy-list/", formdata={'xpp': '3', 'xf1': '0', 'xf2' : '0'}, callback=self.parse) ] # Helper function to process the abstaction
def login(self, response):
    self.log('Logging in...')
    try:
        full_args, args, url, method, params = fill_login_form(response.url, response.body,
                                                               self.login_user, self.login_pass)
        validated_url = self.url_valid(url, response.url)
        real_url = urlsplit(validated_url)
        result_db.add_to_result(method.upper(),
                                real_url.scheme + "://" + real_url.hostname + real_url.path,
                                list(dict(full_args).keys()))
        yield FormRequest(validated_url, method=method, formdata=args,
                          callback=self.confirm_login, dont_filter=True)
    except Exception as e:
        print(e)
        self.log('Login failed')
        for start_url in self.start_urls:
            if ";" in start_url:
                split_arr = start_url.split(';')
                validated_url = split_arr[0]
                yield Request(url=validated_url, dont_filter=True, callback=self.parse_res)
                time.sleep(int(split_arr[1]))
            else:
                validated_url = start_url
                yield Request(url=validated_url, dont_filter=True, callback=self.parse_res)
            real_url = urlsplit(validated_url)
            if len(real_url.query) > 0 and self.get_ext(real_url.path) not in self.not_allowed:
                # only add to result if it has parameters
                param_dict = parse_qs(real_url.query, keep_blank_values=True)
                result_db.add_to_result("GET",
                                        real_url.scheme + "://" + real_url.hostname + real_url.path,
                                        list(param_dict.keys()))
            if self.ignore_params:
                tag_url = real_url.scheme + "://" + real_url.hostname + real_url.path
            else:
                tag_url = validated_url
            for param in self.ignore_fields:
                if param in real_url.query:
                    tag_url = real_url.path
            if tag_url not in self.urls_visited and self.get_ext(real_url.path) not in self.not_allowed:
                self.urls_visited.append(tag_url)
def start_requests(self): url = "https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8" requests = [] for i in range(1,60): formdata = {"q":"", "viewFlag":"A", "sortType":"default", "searchStyle":"", "searchRegion":"city:", "searchFansNum":"", "currentPage":str(i), "pageSize":"100"} request = FormRequest(url,callback=self.parse_model,formdata=formdata) requests.append(request) return requests
def start_requests(self): url = "http://pluto.babyun.com.cn/pluto/api/user/signin" return [FormRequest(url,meta={'cookiejar':1}, formdata = { 'password':'3aF9Ac3R4M76e', 'username':'admin', 'remember':'true', }, callback = self.after_login, )]
def start_requests(self):
    # used for checking that a ticker isn't downloaded twice
    self.requested_tickers = set()

    for category in self.categories:
        self.logger.info('POST request for category "' + category['name'] + '"')
        # return a POST request for getting the index list in this category group
        yield FormRequest(url="https://indexes.nasdaqomx.com/Index/DirectoryData",
                          formdata={'categoryID': str(category['id'])},
                          meta={'exchange': category['name']},
                          callback=self.parse_categories)
def parse_categories(self, response):
    # unpack meta values
    exchange = response.meta['exchange']
    # get a dict with the json data
    data = json.loads(response.body_as_unicode())

    # for all instruments in the list
    for instrument in data['aaData']:
        ticker = instrument['Symbol']
        name = instrument['Name']
        paper_type = instrument['AssetType']

        if ticker in self.requested_tickers:
            self.logger.warning('Ticker "' + ticker + '" has already been requested. Skipping')
            continue

        # POST request for historical data for this ticker
        self.logger.info('Sending POST request for ticker "' + ticker + '"')
        yield FormRequest(url="https://indexes.nasdaqomx.com/Index/HistoryData",
                          formdata={
                              'id': ticker,
                              'startDate': '1950-09-03T00:00:00.000',
                              'endDate': '2050-09-03T00:00:00.000',
                              'timeOfDay': 'EOD'},
                          meta={'ticker': ticker,
                                'name': name,
                                'paper_type': paper_type,
                                'exchange': exchange},
                          callback=self.parse_historical_data)

# parse the POST response containing the ticker data
def parse(self, response):
    parent_path = response.xpath('//section[@id="m-category"]')
    for i in range(1, 9):
        category_name = parent_path.xpath("./ul/li[" + str(i) + "]/a/text()").extract()[0]
        all_urls = parent_path.xpath(".//div/div[" + str(i) + "]/ul/li/a/@href").extract()
        for url in all_urls:
            class_id = re.search(r'\d+', url).group()
            next_url = "http://m.fruitday.com/ajax/prolist/index"
            yield FormRequest(next_url,
                              formdata={"class_id": class_id, "curr_page": "0"},
                              callback=self.parse_data,
                              meta={"cat": category_name, "class_id": class_id, 'page': "0"})
def process_pagination_form(self, form, page=None, product_id=None):
    action = form.xpath('@action').extract_first()
    names = form.xpath('input/@name').extract()
    values = form.xpath('input/@value').extract()
    formdata = dict(zip(names, values))
    meta = dict(prev_page=page, product_id=product_id)
    return FormRequest(
        url=action,
        method='GET',
        formdata=formdata,
        callback=self.parse,
        meta=meta
    )
def parse_product(self, response):
    # Circumvent age selection form.
    if '/agecheck/app' in response.url:
        logger.debug(f"Form-type age check triggered for {response.url}.")

        form = response.css('#agegate_box form')
        action = form.xpath('@action').extract_first()
        name = form.xpath('input/@name').extract_first()
        value = form.xpath('input/@value').extract_first()
        formdata = {
            name: value,
            'ageDay': '1',
            'ageMonth': '1',
            'ageYear': '1955'
        }
        yield FormRequest(
            url=action,
            method='POST',
            formdata=formdata,
            callback=self.parse_product
        )
    else:
        yield load_product(response)
def start_requests(self):
    count = self.sql.get_proxy_count(self.name)
    count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
    ids = self.sql.get_proxy_ids(self.name)
    ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

    for i in range(0, count + count_httpbin):
        table = self.name if (i < count) else config.httpbin_table
        id = ids[i] if i < count else ids_httpbin[i - len(ids)]

        proxy = self.sql.get_proxy_with_id(table, id)
        if proxy is None:
            continue

        for url in self.urls:
            cur_time = time.time()
            yield FormRequest(
                url=url,
                headers=self.headers,
                method='POST',
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'id': proxy.get('id'),
                    'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                    'vali_count': proxy.get('vali_count', 0),
                },
                cookies={
                    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
                    '_ga': 'GA1.2.40497390.1488937014',
                    'TG-TRACK-CODE': 'search_code',
                    'index_location_city': '%E5%8C%97%E4%BA%AC',
                    'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
                    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
                    'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
                    'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
                    'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
                    'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
                    'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
                },
                formdata={
                    'first': 'true',
                    'kd': 'ios',
                    'pn': '1',
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )
def start_requests(self):
    for url in self.start_urls:
        yield FormRequest(url, headers=self.header, callback=self.parse_item)
def start_requests(self):
    count = self.sql.get_proxy_count(self.name)
    count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
    ids = self.sql.get_proxy_ids(self.name)
    ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

    for i in range(0, count + count_httpbin):
        table = self.name if (i < count) else config.httpbin_table
        id = ids[i] if i < count else ids_httpbin[i - len(ids)]

        proxy = self.sql.get_proxy_with_id(table, id)
        if proxy is None:
            continue

        for url in self.urls:
            cur_time = time.time()
            yield FormRequest(
                url=url,
                headers=self.headers,
                method='POST',
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'id': proxy.id,
                    'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                    'vali_count': proxy.vali_count,
                },
                cookies={
                    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
                    '_ga': 'GA1.2.40497390.1488937014',
                    'TG-TRACK-CODE': 'search_code',
                    'index_location_city': '%E5%8C%97%E4%BA%AC',
                    'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
                    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
                    'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
                    'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
                    'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
                    'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
                    'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
                },
                formdata={
                    'first': 'true',
                    'kd': 'ios',
                    'pn': '1',
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )
def getherproxy_req(self):
    """get proxy from gatherproxy.com"""
    block = True
    if not block:
        # method 1 - non-blocking
        url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
        settings = Settings()

        @defer.inlineCallbacks
        def getpage(request, page):
            try:
                print("Request {},pagenumber:{}".format(request, page))
                response = yield HTTP11DownloadHandler(settings).download_request(request, spider=None)
                if response.status == 200:
                    self._get_proxy(response.body.decode(), country=self.country)
            except Exception as e:
                print(e)
                print("[!] Failed: request {} of page:{}".format(request, page))

        def iter_page():
            work = (
                getpage(FormRequest(url=url,
                                    headers=self.headers,
                                    formdata={'Type': 'elite', 'PageIdx': str(page), 'Uptime': '0'},
                                    meta={'download_timeout': 60}),
                        page=page)
                for page in range(1, self.maxpage + 1)
            )
            coop = task.Cooperator()
            join = defer.DeferredList(coop.coiterate(work) for i in range(self.concurrent))
            join.addBoth(lambda _: reactor.stop())

        iter_page()
        reactor.run()
    else:
        # method 2 - blocking
        url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
        for pagenum in range(1, self.maxpage + 1):
            try:
                data = {'Type': 'elite', 'PageIdx': str(pagenum), 'Uptime': '0'}
                headers = copy.copy(self.headers)
                r = requests.post(url, headers=headers, data=data)
            except Exception as e:
                print(str(e))
                print('[!] Failed: %s' % url)
                gatherproxy_list = []
                return gatherproxy_list
            self._get_proxy(r.text, country=self.country)