From e674ae4ca972e2f902dcc96d65fd4e792668b8a2 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Wed, 27 Sep 2017 21:17:17 -0400
Subject: let _normalize_urls() optionally preserve some query args

---
 unbiased/sources/base.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index e91e5a8..14d867e 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -68,7 +68,7 @@ class NewsSource(object):
         return BeautifulSoup(content, 'lxml')
 
     @classmethod
-    def _normalize_url(cls, url):
+    def _normalize_url(cls, url, keep_query_vars=None):
         """
         Make sure they have a scheme.
         Make sure they have a host.
@@ -76,7 +76,16 @@ class NewsSource(object):
         """
         cls_url = urllib.parse.urlparse(cls.url)
         url = urllib.parse.urlparse(url)
-        url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '')
+        if keep_query_vars is None:
+            query = ''
+        else:
+            # parse_qs() decodes values and groups repeated keys into lists,
+            # so rebuild the query with urlencode(doseq=True): pairs are
+            # joined with '&' (not '?') and reserved characters re-escaped.
+            qs = urllib.parse.parse_qs(url.query)
+            query_vars = {v: qs[v] for v in keep_query_vars if v in qs}
+            query = urllib.parse.urlencode(query_vars, doseq=True)
+        url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', query, '')
         return urllib.parse.urlunparse(url)
 
     @classmethod
-- 
cgit v1.2.3