From 48471019c86d9a78a742b282b1b25df6d69c5752 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 11:02:24 -0400 Subject: fix guardian images and image scaling --- unbiased/html_template/unbiased.css | 4 ++-- unbiased/main.py | 12 ++++++++---- unbiased/unbiasedFunctions.py | 32 ++++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 244f100..6817cc9 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -84,7 +84,7 @@ a:hover{ vertical-align:top; text-align:left; width:360px; - height:auto; + height:350px; overflow:hidden; background:#fff; margin:10px; @@ -217,4 +217,4 @@ a:hover{ clear:both; padding-top:4em; font-size:.8em; -} \ No newline at end of file +} diff --git a/unbiased/main.py b/unbiased/main.py index ea5412d..87b1e8c 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -48,10 +48,12 @@ def run(webroot, scratch): ### E.g. Guardian calls buildGuardian(), etc. sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' - + for source in sourceFnArr: + logger.info('Crawling {}'.format(source)) tries=0 while tries<3: + time.sleep(tries) try: fn='build'+source possibles = globals().copy() @@ -61,10 +63,12 @@ def run(webroot, scratch): sourceList.append(src) break except Exception as ex: - logger.error('Build error. Looping again. source={} ex={}'.format(source, ex)) tries+=1 - time.sleep(tries) - + if tries == 3: + logger.error('Build failed. source={} ex={}'.format(source, ex)) + else: + logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex)) + #scrape all urls and build data structure newsSourceArr=buildNewsSourceArr(sourceList, scratch) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 775346f..fdf9d8f 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,3 +1,4 @@ +import html import logging import os import pkgutil @@ -32,7 +33,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t f.close() try: - if sourceName=='The Guardian': + if sourceName=='The Guardian US': #The Guardian puts an identifying banner on their og:images #grab the main image from the page instead @@ -48,14 +49,15 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t elif '')[0] elif sourceName=='ABC News': img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' if img[-1]=='/': - #because the quote separator could be ' or ", + #because the quote separator could be ' or ", #trim to just before it then lop it off img=img[:-1].strip() img=img[:-1] @@ -282,16 +284,34 @@ def buildNewsSourceArr(sourceList, scratchDir): #since everything should have been modified in place return sourceList -def pullImage(url, index, webroot, width=350, height=200): +def pullImage(url, index, webroot, target_width=350, target_height=200): extension = url.split('.')[-1].split('?')[0] img_name = 'img{}.{}'.format(index, extension) out_file = os.path.join(webroot, img_name) try: subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url]) - except Exception: + except Exception as ex: + logger.error('Failed to pull image: url={} ex={}'.format(url, ex)) return '' img = Image.open(out_file) - img.resize((width, height)) + # crop to aspect ratio + target_ar = target_width / target_height + left, top, right, bottom = img.getbbox() + height = bottom - top + width = right - left + ar = width / height + if target_ar > ar: + new_height = (target_height / target_width) * width + bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2)) + img = img.crop(bbox) + elif target_ar < ar: + new_width = (target_width / target_height) * height + bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom) + img = img.crop(bbox) + # resize if larger + if target_width * 2 < width or target_height * 2 < height: + img = img.resize((target_width*2, target_height*2), Image.LANCZOS) + # TODO: create retina images jpg_name = 'img{}.jpg'.format(index) img.save(os.path.join(webroot, jpg_name), 'JPEG') return jpg_name -- cgit v1.2.3