|
40 | 40 | 'invite': 'invited',
|
41 | 41 | }
|
42 | 42 |
|
43 |
| -FAILED_RESOLVE_URL_CACHE_TIME = 60 * 60 * 24 # a day |
44 |
| - |
45 | 43 | # maps lower case string short name to Source subclass. populated by SourceMeta.
|
46 | 44 | sources = {}
|
47 | 45 |
|
@@ -497,7 +495,7 @@ def original_post_discovery(activity, domains=None, cache=None,
|
497 | 495 | # check for redirect and add their final urls
|
498 | 496 | redirects = {} # maps final URL to original URL for redirects
|
499 | 497 | for url in list(candidates):
|
500 |
| - resolved = follow_redirects(url, cache=cache, **kwargs) |
| 498 | + resolved = util.follow_redirects(url, cache=cache, **kwargs) |
501 | 499 | if (resolved.url != url and
|
502 | 500 | resolved.headers.get('content-type', '').startswith('text/html')):
|
503 | 501 | redirects[resolved.url] = url
|
@@ -767,65 +765,3 @@ def _html_to_text(self, html):
|
767 | 765 | return '\n'.join(
|
768 | 766 | # strip trailing whitespace that html2text adds to ends of some lines
|
769 | 767 | line.rstrip() for line in h.unescape(h.handle(html)).splitlines())
|
770 |
| - |
771 |
| -def follow_redirects(url, cache=None, **kwargs): |
772 |
| - """Fetches a URL with HEAD, repeating if necessary to follow redirects. |
773 |
| -
|
774 |
| - Caches resolved URLs in memcache by default. *Does not* raise an exception if |
775 |
| - any of the HTTP requests fail, just returns the failed response. If you care, |
776 |
| - be sure to check the returned response's status code! |
777 |
| -
|
778 |
| - Args: |
779 |
| - url: string |
780 |
| - cache: optional, a cache object to read and write resolved URLs to. Must |
781 |
| - have get(key) and set(key, value, time=...) methods. Stores |
782 |
| - 'R [original URL]' in key, final URL in value. |
783 |
| - **kwargs: passed to requests.head() |
784 |
| -
|
785 |
| - Returns: |
786 |
| - the requests.Response for the final request |
787 |
| - """ |
788 |
| - if cache is not None: |
789 |
| - cache_key = 'R ' + url |
790 |
| - resolved = cache.get(cache_key) |
791 |
| - if resolved is not None: |
792 |
| - return resolved |
793 |
| - |
794 |
| - # can't use urllib2 since it uses GET on redirect requests, even if i specify |
795 |
| - # HEAD for the initial request. |
796 |
| - # http://stackoverflow.com/questions/9967632 |
797 |
| - try: |
798 |
| - # default scheme to http |
799 |
| - parsed = urlparse.urlparse(url) |
800 |
| - if not parsed.scheme: |
801 |
| - url = 'http://' + url |
802 |
| - resolved = util.requests_head(url, allow_redirects=True, **kwargs) |
803 |
| - resolved.raise_for_status() |
804 |
| - if resolved.url != url: |
805 |
| - logging.debug('Resolved %s to %s', url, resolved.url) |
806 |
| - cache_time = 0 # forever |
807 |
| - except AssertionError: |
808 |
| - raise |
809 |
| - except BaseException, e: |
810 |
| - logging.warning("Couldn't resolve URL %s : %s", url, e) |
811 |
| - resolved = requests.Response() |
812 |
| - resolved.url = url |
813 |
| - resolved.status_code = 499 # not standard. i made this up. |
814 |
| - cache_time = FAILED_RESOLVE_URL_CACHE_TIME |
815 |
| - |
816 |
| - content_type = resolved.headers.get('content-type') |
817 |
| - if not content_type: |
818 |
| - type, _ = mimetypes.guess_type(resolved.url) |
819 |
| - resolved.headers['content-type'] = type or 'text/html' |
820 |
| - |
821 |
| - refresh = resolved.headers.get('refresh') |
822 |
| - if refresh: |
823 |
| - for part in refresh.split(';'): |
824 |
| - if part.strip().startswith('url='): |
825 |
| - return follow_redirects(part.strip()[4:], cache=cache, **kwargs) |
826 |
| - |
827 |
| - resolved.url = util.clean_url(resolved.url) |
828 |
| - if cache is not None: |
829 |
| - cache.set_multi({cache_key: resolved, 'R ' + resolved.url: resolved}, |
830 |
| - time=cache_time) |
831 |
| - return resolved |
0 commit comments