diff options
-rwxr-xr-x | unbiased/html_template/unbiased.css | 4 | ||||
-rwxr-xr-x | unbiased/main.py | 12 | ||||
-rw-r--r-- | unbiased/unbiasedFunctions.py | 32 |
3 files changed, 36 insertions, 12 deletions
diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 244f100..6817cc9 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -84,7 +84,7 @@ a:hover{ vertical-align:top;
text-align:left;
width:360px;
- height:auto;
+ height:350px;
overflow:hidden;
background:#fff;
margin:10px;
@@ -217,4 +217,4 @@ a:hover{ clear:both;
padding-top:4em;
font-size:.8em;
-}
\ No newline at end of file +}
diff --git a/unbiased/main.py b/unbiased/main.py index ea5412d..87b1e8c 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -48,10 +48,12 @@ def run(webroot, scratch): ### E.g. Guardian calls buildGuardian(), etc. sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' - + for source in sourceFnArr: + logger.info('Crawling {}'.format(source)) tries=0 while tries<3: + time.sleep(tries) try: fn='build'+source possibles = globals().copy() @@ -61,10 +63,12 @@ def run(webroot, scratch): sourceList.append(src) break except Exception as ex: - logger.error('Build error. Looping again. source={} ex={}'.format(source, ex)) tries+=1 - time.sleep(tries) - + if tries == 3: + logger.error('Build failed. source={} ex={}'.format(source, ex)) + else: + logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex)) + #scrape all urls and build data structure newsSourceArr=buildNewsSourceArr(sourceList, scratch) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 775346f..fdf9d8f 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,3 +1,4 @@ +import html
import logging
import os
import pkgutil
@@ -32,7 +33,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t f.close()
try:
- if sourceName=='The Guardian':
+ if sourceName=='The Guardian US':
#The Guardian puts an identifying banner on their og:images
#grab the main image from the page instead
@@ -48,14 +49,15 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t elif '<img class="immersive-main-media__media"' in content:
img=content.split('<img class="immersive-main-media__media"', 1)[1]
img=img.split('src="', 1)[1].split('"')[0]
-
+ img = html.unescape(img)
+
else:
if 'og:image' in content:
img=content.split('og:image" content=')[1][1:].split('>')[0]
elif sourceName=='ABC News':
img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
if img[-1]=='/':
- #because the quote separator could be ' or ",
+ #because the quote separator could be ' or ",
#trim to just before it then lop it off
img=img[:-1].strip()
img=img[:-1]
@@ -282,16 +284,34 @@ def buildNewsSourceArr(sourceList, scratchDir): #since everything should have been modified in place
return sourceList
-def pullImage(url, index, webroot, width=350, height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
+    """Download an image, center-crop it to the target aspect ratio, save it as JPEG.
+
+    The raw download is written to ``<webroot>/img<index>.<ext>`` (extension
+    taken from the URL) and the processed result to ``<webroot>/img<index>.jpg``.
+
+    Args:
+        url: Address of the image to fetch (downloaded with ``wget``).
+        index: Number used to build the output file names.
+        webroot: Directory that receives both the raw and the processed file.
+        target_width: Width of the display slot, in pixels.
+        target_height: Height of the display slot, in pixels.
+
+    Returns:
+        The JPEG file name (relative to ``webroot``), or ``''`` when the
+        download fails.
+    """
     extension = url.split('.')[-1].split('?')[0]
     img_name = 'img{}.{}'.format(index, extension)
     out_file = os.path.join(webroot, img_name)
     try:
         subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url])
-    except Exception:
+    except Exception as ex:
+        logger.error('Failed to pull image: url={} ex={}'.format(url, ex))
         return ''
     img = Image.open(out_file)
-    img.resize((width, height))
+    # crop to aspect ratio
+    # Use the real pixel dimensions: getbbox() is the bounding box of the
+    # non-zero pixels, so it is None for an all-black image and too small
+    # for images with black borders.
+    width, height = img.size
+    target_ar = target_width / target_height
+    ar = width / height
+    if target_ar > ar:
+        # too tall for the slot: trim the same amount from top and bottom
+        new_height = min(height, max(1, round(width / target_ar)))
+        top = (height - new_height) // 2
+        img = img.crop((0, top, width, top + new_height))
+    elif target_ar < ar:
+        # too wide for the slot: trim the same amount from left and right
+        new_width = min(width, max(1, round(height * target_ar)))
+        left = (width - new_width) // 2
+        img = img.crop((left, 0, left + new_width, height))
+    # resize if larger
+    # Compare the post-crop size, otherwise an image that only exceeded the
+    # limit before cropping would be scaled up.
+    width, height = img.size
+    if target_width * 2 < width or target_height * 2 < height:
+        img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
+    # TODO: create retina images
+    # JPEG cannot store alpha or palette data (RGBA / LA / P sources).
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
     jpg_name = 'img{}.jpg'.format(index)
     img.save(os.path.join(webroot, jpg_name), 'JPEG')
     return jpg_name
|