我想下载此网页中链接指向的 zip 文件:https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa 。然后将其保存到目录“/home/vinvin/shKLSE/”中(我正在使用 PythonAnywhere),再将其解压缩,把其中的 csv 文件提取到该目录中。
该代码一直运行到最后,没有报错,但是文件并没有被下载。手动单击以下链接时,zip 文件会自动下载:
https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa 。
我的代码中带有有效的用户名和密码。我使用了真实的用户名和密码,以便更容易理解和重现问题。
#!/usr/bin/python print "hello from python 2" import urllib2 from selenium import webdriver from selenium.webdriver.common.keys import Keys import time from pyvirtualdisplay import Display import requests, zipfile, os display = Display(visible=0, size=(800, 600)) display.start() profile = webdriver.FirefoxProfile() profile.set_preference('browser.download.folderList', 2) profile.set_preference('browser.download.manager.showWhenStarting', False) profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/") profile.set_preference('browser.helperApps.neverAsk.saveToDisk', '/zip') for retry in range(5): try: browser = webdriver.Firefox(profile) print "firefox" break except: time.sleep(3) time.sleep(1) browser.get("https://www.shareinvestor.com/my") time.sleep(10) login_main = browser.find_element_by_xpath("//*[@href='/user/login.html']").click() print browser.current_url username = browser.find_element_by_id("sic_login_header_username") password = browser.find_element_by_id("sic_login_header_password") print "find id done" username.send_keys("bkcollection") password.send_keys("123456") print "log in done" login_attempt = browser.find_element_by_xpath("//*[@type='submit']") login_attempt.submit() browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa") print browser.current_url time.sleep(20) dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click() time.sleep(30) browser.close() browser.quit() display.stop() zip_ref = zipfile.ZipFile(/home/vinvin/sh/KLSE, 'r') zip_ref.extractall(/home/vinvin/sh/KLSE) zip_ref.close() os.remove(zip_ref)
HTML snippet:
<li><a href="/prices/price_download_zip_file.zip?type=history_all&market=bursa">All Historical Data</a> <span>About 220 MB</span></li>
这个片段在页面源代码中是隐藏的,所以我猜它是由 JavaScript 生成的。
我观察到的现象:
The directory /home/vinvin/shKLSE is not created, even though the code runs with no errors.
home/vinvin/shKLSE
I tried downloading a much smaller zip file, which should complete within a second, but it still was not downloaded after waiting 30 seconds:
dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_daily&date=20170519&market=bursa']").click()
我认为您的主要问题可能是错误的 MIME 类型,不过您的脚本还存在许多系统性问题,这会使它充其量也只是不可靠的。此重写使用显式等待,完全消除了对 time.sleep() 的需要,从而使其能够尽可能快地运行,同时还消除了由于网络拥塞而导致的错误。
您需要执行以下操作以确保已安装所有模块:
pip install requests explicit selenium retry pyvirtualdisplay
The script:
#!/usr/bin/python from __future__ import print_function # Makes your code portable import os import glob import zipfile from contextlib import contextmanager import requests from retry import retry from explicit import waiter, XPATH, ID from selenium import webdriver from pyvirtualdisplay import Display from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.wait import WebDriverWait DOWNLOAD_DIR = "/tmp/shKLSE/" def build_profile(): profile = webdriver.FirefoxProfile() profile.set_preference('browser.download.folderList', 2) profile.set_preference('browser.download.manager.showWhenStarting', False) profile.set_preference('browser.download.dir', DOWNLOAD_DIR) # I think your `/zip` mime type was incorrect. This works for me profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/vnd.ms-excel,application/zip') return profile # Retry is an elegant way to retry the browser creation # Though you should narrow the scope to whatever the actual exception is you are # retrying on @retry(Exception, tries=5, delay=3) @contextmanager # This turns get_browser into a context manager def get_browser(): # Use a context manager with Display, so it will be closed even if an # exception is thrown profile = build_profile() with Display(visible=0, size=(800, 600)): browser = webdriver.Firefox(profile) print("firefox") try: yield browser finally: # Let a try/finally block manage closing the browser, even if an # exception is called browser.quit() def main(): print("hello from python 2") with get_browser() as browser: browser.get("https://www.shareinvestor.com/my") # Click the login button # waiter is a helper function that makes it easy to use explicit waits # with it you dont need to use time.sleep() calls at all login_xpath = '//*/div[@class="sic_logIn-bg"]/a' waiter.find_element(browser, login_xpath, XPATH).click() print(browser.current_url) # Log in username = "bkcollection" username_id = "sic_login_header_username" password = "123456" 
password_id = "sic_login_header_password" waiter.find_write(browser, username_id, username, by=ID) waiter.find_write(browser, password_id, password, by=ID, send_enter=True) # Wait for login process to finish by locating an element only found # after logging in, like the Logged In Nav nav_id = 'sic_loggedInNav' waiter.find_element(browser, nav_id, ID) print("log in done") # Load the target page target_url = ("https://www.shareinvestor.com/prices/price_download.html#/?" "type=price_download_all_stocks_bursa") browser.get(target_url) print(browser.current_url) # CLick download button all_data_xpath = ("//*[@href='/prices/price_download_zip_file.zip?" "type=history_all&market=bursa']") waiter.find_element(browser, all_data_xpath, XPATH).click() # This is a bit challenging: You need to wait until the download is complete # This file is 220 MB, it takes a while to complete. This method waits until # there is at least one file in the dir, then waits until there are no # filenames that end in `.part` # Note that is is problematic if there is already a file in the target dir. I # suggest looking into using the tempdir module to create a unique, temporary # directory for downloading every time you run your script print("Waiting for download to complete") at_least_1 = lambda x: len(x("{0}/*.zip*".format(DOWNLOAD_DIR))) > 0 WebDriverWait(glob.glob, 300).until(at_least_1) no_parts = lambda x: len(x("{0}/*.part".format(DOWNLOAD_DIR))) == 0 WebDriverWait(glob.glob, 300).until(no_parts) print("Download Done") # Now do whatever it is you need to do with the zip file # zip_ref = zipfile.ZipFile(DOWNLOAD_DIR, 'r') # zip_ref.extractall(DOWNLOAD_DIR) # zip_ref.close() # os.remove(zip_ref) print("Done!") if __name__ == "__main__": main()
完全公开:我是 explicit 模块的维护者。它旨在让显式等待更易于使用,适用于像本例这样网站根据用户交互缓慢加载动态内容的情况。您也可以用直接的显式等待替换上面所有的 waiter.XXX 调用。