aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStevan Radakovic <stevan.radakovic@linaro.org>2012-05-09 15:58:32 +0200
committerStevan Radakovic <stevan.radakovic@linaro.org>2012-05-09 15:58:32 +0200
commit149222932195d188278332877b8714cc0bde2831 (patch)
tree7a1c248986f777f0720e5efdc7359b8a169357bd
parent716594af8fddd535a0c36a9daf60c791cd010ac1 (diff)
Replace the filefetcher with newer version. Change the code so the old changes to filefetcher get() method are incorporated in the new one as well.
-rw-r--r--testing/filefetcher.py129
-rw-r--r--testing/license_protected_file_downloader.py281
-rw-r--r--testing/test_click_through_license.py2
3 files changed, 282 insertions, 130 deletions
diff --git a/testing/filefetcher.py b/testing/filefetcher.py
deleted file mode 100644
index d14e9f0..0000000
--- a/testing/filefetcher.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-
-# Changes required to address EULA for the origen hwpacks
-
-import argparse
-import os
-import pycurl
-import re
-import urlparse
-
-
-class LicenseProtectedFileFetcher:
- """Fetch a file from the web that may be protected by a license redirect
-
- This is designed to run on snapshots.linaro.org. License HTML file are in
- the form:
-
- <vendor>.html has a link to <vendor>-accept.html
-
- If self.get is pointed at a file that has to go through one of these
- licenses, it should be able to automatically accept the license and
- download the file.
-
- Once a license has been accepted, it will be used for all following
- downloads.
-
- If self.close() is called before the object is deleted, cURL will store
- the license accept cookie to cookies.txt, so it can be used for later
- downloads.
-
- """
- def __init__(self):
- """Set up cURL"""
- self.curl = pycurl.Curl()
- self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
- self.curl.setopt(pycurl.WRITEFUNCTION, self._write_body)
- self.curl.setopt(pycurl.HEADERFUNCTION, self._write_header)
- self.curl.setopt(pycurl.COOKIEFILE, "cookies.txt")
- self.curl.setopt(pycurl.COOKIEJAR, "cookies.txt")
-
- def _get(self, url):
- """Clear out header and body storage, fetch URL, filling them in."""
- self.curl.setopt(pycurl.URL, url)
-
- self.body = ""
- self.header = ""
-
- self.curl.perform()
-
- def get(self, url, ignore_license=False, accept_license=True):
- """Fetch the requested URL, ignoring license at all or
- accepting or declining licenses, returns file body.
-
- Fetches the file at url. If a redirect is encountered, it is
- expected to be to a license that has an accept or decline link.
- Follow that link, then download original file or nolicense notice.
-
- """
- self._get(url)
-
- if ignore_license:
- return self.body
-
- location = self._get_location()
- if location:
- # Off to the races - we have been redirected.
- # Expect to find a link to self.location with -accepted or
- # -declined inserted before the .html,
- # i.e. ste.html -> ste-accepted.html
-
- # Get the file from the URL (full path)
- file = urlparse.urlparse(location).path
-
- # Get the file without the rest of the path
- file = os.path.split(file)[-1]
-
- # Look for a link with accepted.html or declined.html
- # in the page name. Follow it.
- new_file = None
- for line in self.body.splitlines():
- if accept_license:
- link_search = re.search("""href=.*?["'](.*?-accepted.html)""",
- line)
- else:
- link_search = re.search("""href=.*?["'](.*?-declined.html)""",
- line)
- if link_search:
- # Have found license decline URL!
- new_file = link_search.group(1)
-
- if new_file:
- # accept or decline the license...
- next_url = re.sub(file, new_file, location)
- self._get(next_url)
-
- # The above get *should* take us to the file requested via
- # a redirect. If we manually need to follow that redirect,
- # do that now.
-
- if accept_license and self._get_location():
- # If we haven't been redirected to our original file,
- # we should be able to just download it now.
- self._get(url)
-
- return self.body
-
- def _search_header(self, field):
- """Search header for the supplied field, return field / None"""
- for line in self.header.splitlines():
- search = re.search(field + ":\s+(.*?)$", line)
- if search:
- return search.group(1)
- return None
-
- def _get_location(self):
- """Return content of Location field in header / None"""
- return self._search_header("Location")
-
- def _write_body(self, buf):
- """Used by curl as a sink for body content"""
- self.body += buf
-
- def _write_header(self, buf):
- """Used by curl as a sink for header content"""
- self.header += buf
-
- def close(self):
- """Wrapper to close curl - this will allow curl to write out cookies"""
- self.curl.close()
diff --git a/testing/license_protected_file_downloader.py b/testing/license_protected_file_downloader.py
new file mode 100644
index 0000000..d4ebacd
--- /dev/null
+++ b/testing/license_protected_file_downloader.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python
+
+import argparse
+import os
+import pycurl
+import re
+import urlparse
+import html2text
+from BeautifulSoup import BeautifulSoup
+
+class LicenseProtectedFileFetcher:
+ """Fetch a file from the web that may be protected by a license redirect
+
+    This is designed to run on snapshots.linaro.org. License HTML files are in
+ the form:
+
+ <vendor>.html has a link to <vendor>-accept.html
+
+ If self.get is pointed at a file that has to go through one of these
+ licenses, it should be able to automatically accept the license and
+ download the file.
+
+ Once a license has been accepted, it will be used for all following
+ downloads.
+
+ If self.close() is called before the object is deleted, cURL will store
+ the license accept cookie to cookies.txt, so it can be used for later
+ downloads.
+
+ """
+ def __init__(self, cookie_file="cookies.txt"):
+ """Set up cURL"""
+ self.curl = pycurl.Curl()
+ self.curl.setopt(pycurl.WRITEFUNCTION, self._write_body)
+ self.curl.setopt(pycurl.HEADERFUNCTION, self._write_header)
+ self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+ self.curl.setopt(pycurl.COOKIEFILE, cookie_file)
+ self.curl.setopt(pycurl.COOKIEJAR, cookie_file)
+ self.file_out = None
+
+ def _get(self, url):
+ """Clear out header and body storage, fetch URL, filling them in."""
+ url = url.encode("ascii")
+ self.curl.setopt(pycurl.URL, url)
+
+ self.body = ""
+ self.header = ""
+
+ if self.file_name:
+ self.file_out = open(self.file_name, 'w')
+ else:
+ self.file_out = None
+
+ self.curl.perform()
+ self._parse_headers(url)
+
+ if self.file_out:
+ self.file_out.close()
+
+ def _parse_headers(self, url):
+ header = {}
+ for line in self.header.splitlines():
+ # Header lines typically are of the form thing: value...
+ test_line = re.search("^(.*?)\s*:\s*(.*)$", line)
+
+ if test_line:
+ header[test_line.group(1)] = test_line.group(2)
+
+ # The location attribute is sometimes relative, but we would
+ # like to have it as always absolute...
+ if 'Location' in header:
+ parsed_location = urlparse.urlparse(header["Location"])
+
+ # If not an absolute location...
+ if not parsed_location.netloc:
+ parsed_source_url = urlparse.urlparse(url)
+ new_location = ["", "", "", "", ""]
+
+ new_location[0] = parsed_source_url.scheme
+ new_location[1] = parsed_source_url.netloc
+ new_location[2] = header["Location"]
+
+ # Update location with absolute URL
+ header["Location"] = urlparse.urlunsplit(new_location)
+
+ self.header_text = self.header
+ self.header = header
+
+ def get_headers(self, url):
+ url = url.encode("ascii")
+ self.curl.setopt(pycurl.URL, url)
+
+ self.body = ""
+ self.header = ""
+
+ # Setting NOBODY causes CURL to just fetch the header.
+ self.curl.setopt(pycurl.NOBODY, True)
+ self.curl.perform()
+ self.curl.setopt(pycurl.NOBODY, False)
+
+ self._parse_headers(url)
+
+ return self.header
+
+ def get_or_return_license(self, url, file_name=None):
+ """Get file at the requested URL or, if behind a license, return that.
+
+ If the URL provided does not redirect us to a license, then return the
+ body of that file. If we are redirected to a license click through
+ then return (the license as plain text, url to accept the license).
+
+ If the user of this function accepts the license, then they should
+ call get_protected_file."""
+
+ self.file_name = file_name
+
+ # Get the license details. If this returns None, the file isn't license
+ # protected and we can just return the file we started to get in the
+ # function (self.body).
+ license_details = self._get_license(url)
+
+ if license_details:
+ return license_details
+
+ return self.body
+
+ def get(self, url, file_name=None, ignore_license=False, accept_license=True):
+ """Fetch the requested URL, accepting licenses
+
+ Fetches the file at url. If a redirect is encountered, it is
+ expected to be to a license that has an accept link. Follow that link,
+        then download the original file. Returns the first 1MB of the file
+ (see _write_body).
+
+ """
+
+ self.file_name = file_name
+ if ignore_license:
+ self._get(url)
+ return self.body
+
+ license_details = self._get_license(url)
+
+ if license_details:
+ # Found a license.
+ if accept_license:
+ # Accept the license without looking at it and
+ # start fetching the file we originally wanted.
+ accept_url = license_details[1]
+ self.get_protected_file(accept_url, url)
+ else:
+ # We want to decline the license and return the notice.
+ decline_url = license_details[2]
+ self._get(decline_url)
+
+ else:
+ # If we got here, there wasn't a license protecting the file
+ # so we just fetch it.
+ self._get(url)
+
+ return self.body
+
+ def _get_license(self, url):
+ """Return (license, accept URL, decline URL) if found,
+ else return None.
+
+ """
+
+ self.get_headers(url)
+
+ if "Location" in self.header and self.header["Location"] != url:
+ # We have been redirected to a new location - the license file
+ location = self.header["Location"]
+
+ # Fetch the license HTML
+ self._get(location)
+
+ # Get the file from the URL (full path)
+ file = urlparse.urlparse(location).path
+
+ # Get the file without the rest of the path
+ file = os.path.split(file)[-1]
+
+ # Look for a link with accepted.html in the page name. Follow it.
+ for line in self.body.splitlines():
+ accept_search = re.search(
+ r"""href=.*?["'](.*?-accepted.html)""",
+ line)
+ decline_search = re.search(
+ r"""href=.*?["'](.*?-declined.html)""",
+ line)
+
+ if accept_search and decline_search:
+ # Have found license accept URL!
+ new_file = accept_search.group(1)
+ accept_url = re.sub(file, new_file, location)
+
+ # Found decline URL as well.
+ new_file_decline = decline_search.group(1)
+ decline_url = re.sub(file, new_file_decline, location)
+
+ # Parse the HTML using BeautifulSoup
+ soup = BeautifulSoup(self.body)
+
+ # The license is in a div with the ID license-text, so we
+ # use this to pull just the license out of the HTML.
+ html_license = u""
+ for chunk in soup.findAll(id="license-text"):
+ # Output of chunk.prettify is UTF8, but comes back
+ # as a str, so convert it here.
+ html_license += chunk.prettify().decode("utf-8")
+
+ text_license = html2text.html2text(html_license)
+
+ return text_license, accept_url, decline_url
+
+ return None
+
+ def get_protected_file(self, accept_url, url):
+ """Gets the file redirected to by the accept_url"""
+
+ self._get(accept_url) # Accept the license
+
+ if not("Location" in self.header and self.header["Location"] == url):
+ # If we got here, we don't have the file yet (weren't redirected
+ # to it). Fetch our target file. This should work now that we have
+ # the right cookie.
+ self._get(url) # Download the target file
+
+ return self.body
+
+ def _write_body(self, buf):
+ """Used by curl as a sink for body content"""
+
+ # If we have a target file to write to, write to it
+ if self.file_out:
+ self.file_out.write(buf)
+
+ # Only buffer first 1MB of body. This should be plenty for anything
+ # we wish to parse internally.
+ if len(self.body) < 1024*1024*1024:
+ # XXX Would be nice to stop keeping the file in RAM at all and
+ # passing large buffers around. Perhaps only keep in RAM if
+ # file_name == None? (used for getting directory listings
+ # normally).
+ self.body += buf
+
+ def _write_header(self, buf):
+ """Used by curl as a sink for header content"""
+ self.header += buf
+
+ def register_progress_callback(self, callback):
+ self.curl.setopt(pycurl.NOPROGRESS, 0)
+ self.curl.setopt(pycurl.PROGRESSFUNCTION, callback)
+
+ def close(self):
+ """Wrapper to close curl - this will allow curl to write out cookies"""
+ self.curl.close()
+
+def main():
+ """Download file specified on command line"""
+ parser = argparse.ArgumentParser(description="Download a file, accepting "
+ "any licenses required to do so.")
+
+ parser.add_argument('url', metavar="URL", type=str, nargs=1,
+ help="URL of file to download.")
+
+ args = parser.parse_args()
+
+ fetcher = LicenseProtectedFileFetcher()
+
+ # Get file name from URL
+ file_name = os.path.basename(urlparse.urlparse(args.url[0]).path)
+ if not file_name:
+ file_name = "downloaded"
+ fetcher.get(args.url[0], file_name)
+
+ fetcher.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/testing/test_click_through_license.py b/testing/test_click_through_license.py
index ea7cf72..b8e1b3c 100644
--- a/testing/test_click_through_license.py
+++ b/testing/test_click_through_license.py
@@ -9,7 +9,7 @@ import socket
from testtools import TestCase
from testtools.matchers import Mismatch
-from filefetcher import LicenseProtectedFileFetcher
+from license_protected_file_downloader import LicenseProtectedFileFetcher
fetcher = LicenseProtectedFileFetcher()
cwd = os.getcwd()