How to fix a connection to https NSIDC/NASA website?

Asked by: Iron Banker Of Braavos  Asked on: 10/21/2017  Last edited by: Iron Banker Of Braavos  Updated: 10/24/2017  Views: 200

Q:

I have been using Python code to search for and download SMAP satellite data from the NSIDC HTTPS site. My code worked fine until last week, when it started raising this error:

urllib2.HTTPError: HTTP Error 404: Not Found

Any help?

The code was adapted from the NSIDC website, which suggests it does exactly what I need. Here is the example:

"""This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory.

This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python
Last edited Jan 26, 2017 G. Deemer""" 

import urllib2
import os
from cookielib import CookieJar
from HTMLParser import HTMLParser

# Define a custom HTML parser to scrape the contents of the HTML data table
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

parser = MyHTMLParser() 

# Define function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        print "downloading: ", dat
        JobRequest = urllib2.Request(url+dat)
        JobRequest.add_header('cookie', cookie_jar) # Pass the saved cookie into additional HTTP request
        JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

        # Request the resource at the modified redirect url
        Request = urllib2.Request(JobRedirect_url)
        Response = urllib2.urlopen(Request)
        f = open( dat, 'wb')
        f.write(Response.read())
        f.close()
        Response.close()
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))
#===========================================================================
# The following code block is used for HTTPS authentication
#===========================================================================

# The user credentials that will be used to authenticate access to the data
username = "user"
password = "password"

# The FULL url of the directory which contains the files you would like to bulk download

url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/" # Example URL
# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login

password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate).  Ideally, we
# should use a file based cookie jar to preserve cookies between runs. This
# will make it much more efficient.

cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    #urllib2.HTTPHandler(debuglevel=1),    # Uncomment these two lines to see
    #urllib2.HTTPSHandler(debuglevel=1),   # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.

#===========================================================================
# Open a request to grab filenames within a directory. Print optional
#===========================================================================

DirRequest = urllib2.Request(url)
DirResponse = urllib2.urlopen(DirRequest)

# Get the redirect url and append 'app_type=401'
# to do basic http auth
DirRedirect_url = DirResponse.geturl()
DirRedirect_url += '&app_type=401'

# Request the resource at the modified redirect url
DirRequest = urllib2.Request(DirRedirect_url)
DirResponse = urllib2.urlopen(DirRequest)

DirBody = DirResponse.read()

# Use the HTML parser defined above to print the content of the directory containing data
parser.feed(DirBody)
Files = parser.dataList

# Display the contents of the python list declared in the HTMLParser class
# print Files #Uncomment to print a list of the files

#=========================================================================
# Call the function to download all files in url
#=========================================================================

BatchJob(Files, cookie_jar) # Comment out to prevent downloading to your working directory
python-2.7 download html-parsing urllib2

Comments

0 votes  t.m.adam  10/22/2017
Why don't you use url for your request? DirRedirect_url results in a 404 response.
0 votes  Iron Banker Of Braavos  10/22/2017
@t.m.adam, because the code uses the url to find the files inside each folder on the site. That is why it has this scraper: it checks the site, gets the available files, and downloads the data.

A:

0 votes  Iron Banker Of Braavos  10/24/2017  #1

I was able to fix the error by loading the website directly and picking out the images to download, as the code below shows.

"""This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory.

This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python Last edited Jan 26, 2017 G. Deemer""" 

import urllib2
import os
from cookielib import CookieJar


# Define function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        print "downloading: ", dat
        JobRequest = urllib2.Request(url+dat)
        JobRequest.add_header('cookie', cookie_jar) # Pass the saved cookie into additional HTTP request
        JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

        # Request the resource at the modified redirect url
        Request = urllib2.Request(JobRedirect_url)
        Response = urllib2.urlopen(Request)
        f = open(dat, 'wb')
        f.write(Response.read())
        f.close()
        Response.close()
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))
#===========================================================================
# The following code block is used for HTTPS authentication
#===========================================================================

# The user credentials that will be used to authenticate access to the data
username = "user"
password = "password"

# The FULL url of the directory which contains the files you would like to bulk download

url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/" # Example URL
# Create a password manager to deal with the 401 response that is returned from Earthdata Login

password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, 
                              "https://urs.earthdata.nasa.gov", 
                              username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate).  Ideally, we
# should use a file based cookie jar to preserve cookies between runs. This
# will make it much more efficient.

cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    #urllib2.HTTPHandler(debuglevel=1),    # Uncomment these two lines to see
    #urllib2.HTTPSHandler(debuglevel=1),   # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.

#===========================================================================
# Open a request to grab filenames within a directory. Print optional
#===========================================================================

DirResponse = urllib2.urlopen(url)
htmlPage = DirResponse.read()

listFiles = [x.split(">")[0].replace('"', "")
             for x in htmlPage.split("><a href=")
             if x.split(">")[0].endswith('.h5"')]

# Display the contents of the python list of filenames parsed from the page
# print listFiles   # Uncomment to print a list of the files

#=========================================================================
# Call the function to download all files in url
#=========================================================================

BatchJob(listFiles, cookie_jar) # Comment out to prevent downloading to your working directory
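For reference, the same string-splitting extraction can be checked against a hand-written line of the directory-listing HTML (the granule name here is only illustrative, not a real file):

samplePage = ('<td><a href="SMAP_L4_SM_gph_20171014T013000_Vv3030_001.h5">'
              'SMAP_L4_SM_gph_20171014T013000_Vv3030_001.h5</a></td>')

# Same split/filter used above: keep only the href values that end in .h5
sampleFiles = [x.split(">")[0].replace('"', "")
               for x in samplePage.split("><a href=")
               if x.split(">")[0].endswith('.h5"')]

print sampleFiles   # -> ['SMAP_L4_SM_gph_20171014T013000_Vv3030_001.h5']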