fix guardian images and image scaling

author: Matt Singleton <matt@xcolour.net> 2017-04-19 11:02:24 -0400
committer: Matt Singleton <matt@xcolour.net> 2017-04-19 11:02:24 -0400
commit: 48471019c86d9a78a742b282b1b25df6d69c5752 (patch)
tree: a21a9c6f6757a0fe090ce51e75ca8690e29ec36a
parent: c3d54bbe304708693891fe68cf3760c5fb2545b3 (diff)
3 files changed, 36 insertions, 12 deletions
diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css
index 244f100..6817cc9 100755
--- a/unbiased/html_template/unbiased.css
+++ b/unbiased/html_template/unbiased.css
@@ -84,7 +84,7 @@ a:hover{
     vertical-align:top;
     text-align:left;
     width:360px;
-    height:auto;
+    height:350px;
     overflow:hidden;
     background:#fff;
     margin:10px;
@@ -217,4 +217,4 @@ a:hover{
     clear:both;
     padding-top:4em;
     font-size:.8em;
-}
-\ No newline at end of file
+}
diff --git a/unbiased/main.py b/unbiased/main.py
index ea5412d..87b1e8c 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -48,10 +48,12 @@ def run(webroot, scratch):
     ### E.g. Guardian calls buildGuardian(), etc.
     sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS',
                  'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze'
-    
+
     for source in sourceFnArr:
+        logger.info('Crawling {}'.format(source))
         tries=0
         while tries<3:
+            time.sleep(tries)
             try:
                 fn='build'+source
                 possibles = globals().copy()
@@ -61,10 +63,12 @@ def run(webroot, scratch):
                 sourceList.append(src)
                 break
             except Exception as ex:
-                logger.error('Build error. Looping again. source={} ex={}'.format(source, ex))
                 tries+=1
-                time.sleep(tries)
-    
+                if tries == 3:
+                    logger.error('Build failed. source={} ex={}'.format(source, ex))
+                else:
+                    logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+
     #scrape all urls and build data structure
     newsSourceArr=buildNewsSourceArr(sourceList, scratch)
 
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 775346f..fdf9d8f 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,3 +1,4 @@
+import html
 import logging
 import os
 import pkgutil
@@ -32,7 +33,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
     f.close()
 
     try:
-        if sourceName=='The Guardian':
+        if sourceName=='The Guardian US':
             #The Guardian puts an identifying banner on their og:images
             #grab the main image from the page instead
 
@@ -48,14 +49,15 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
             elif '<img class="immersive-main-media__media"' in content:
                 img=content.split('<img class="immersive-main-media__media"', 1)[1]
                 img=img.split('src="', 1)[1].split('"')[0]
-            
+            img = html.unescape(img)
+
         else:
             if 'og:image' in content:
                 img=content.split('og:image" content=')[1][1:].split('>')[0]
             elif sourceName=='ABC News':
                 img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
             if img[-1]=='/':
-                #because the quote separator could be ' or ", 
+                #because the quote separator could be ' or ",
                 #trim to just before it then lop it off
                 img=img[:-1].strip()
             img=img[:-1]
@@ -282,16 +284,34 @@ def buildNewsSourceArr(sourceList, scratchDir):
     #since everything should have been modified in place
     return sourceList        
 
-def pullImage(url, index, webroot, width=350, height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
     extension = url.split('.')[-1].split('?')[0]
     img_name = 'img{}.{}'.format(index, extension)
     out_file = os.path.join(webroot, img_name)
     try:
         subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url])
-    except Exception:
+    except Exception as ex:
+        logger.error('Failed to pull image: url={} ex={}'.format(url, ex))
         return ''
     img = Image.open(out_file)
-    img.resize((width, height))
+    # crop to aspect ratio
+    target_ar = target_width / target_height
+    left, top, right, bottom = img.getbbox()
+    height = bottom - top
+    width = right - left
+    ar = width / height
+    if target_ar > ar:
+        new_height = (target_height / target_width) * width
+        bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
+        img = img.crop(bbox)
+    elif target_ar < ar:
+        new_width = (target_width / target_height) * height
+        bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
+        img = img.crop(bbox)
+    # resize if larger
+    if target_width * 2 < width or target_height * 2 < height:
+        img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
+    # TODO: create retina images
     jpg_name = 'img{}.jpg'.format(index)
     img.save(os.path.join(webroot, jpg_name), 'JPEG')
     return jpg_name
author	Matt Singleton <matt@xcolour.net>	2017-04-19 11:02:24 -0400
committer	Matt Singleton <matt@xcolour.net>	2017-04-19 11:02:24 -0400
commit	48471019c86d9a78a742b282b1b25df6d69c5752 (patch)
tree	a21a9c6f6757a0fe090ce51e75ca8690e29ec36a
parent	c3d54bbe304708693891fe68cf3760c5fb2545b3 (diff)