From 8e87842bdbd8525c4fa6ec8f1bd95aa42ab9318b Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Wed, 15 Feb 2017 15:33:50 -0600 Subject: The Blaze added to new parser; also fixed Blaze desription fields --- parser.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'parser.py') diff --git a/parser.py b/parser.py index 1a306cf..b9a05b9 100644 --- a/parser.py +++ b/parser.py @@ -3,6 +3,7 @@ from unbiasedObjects import * from unbiasedFunctions import buildArticle import os +import re ''' @@ -164,6 +165,75 @@ def removeBadStories(source, badDescArr, badAuthorArr, badImgArr): return source +''' +Function to fix the oddly short og:descriptions provided +in The Blaze articles by grabbing the first portion of the story instead +''' +def blazeFixDesc(articleArr): + TAG_RE = re.compile(r'<[^>]+>') + for i in range(len(articleArr)): + desc=urlToContent(articleArr[i].url) + desc=desc.split('
', 1)[1] + desc=desc.split('

', 1)[1] + desc=TAG_RE.sub('', desc) + desc=desc.replace('\n', ' ') + desc=desc[:144] + print(desc+'\n\n') + articleArr[i].description=desc + + return articleArr + + + +def buildBlaze(): + url='http://theblaze.com' + name='The Blaze' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('', 1)[1] + h1=h1.split('', 1)[0] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '\n\n

\n\n