summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- main.py 8
-rw-r--r-- parser.py 70
2 files changed, 76 insertions, 2 deletions
diff --git a/main.py b/main.py
index 09bfddc..cf68d01 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,9 @@ def main():
def run():
sourceList=[]
+ blz=buildBlaze()
+ sourceList.append(blz)
+
bbc=buildBBC()
sourceList.append(bbc)
@@ -24,7 +27,8 @@ def run():
cbs=buildCBS()
sourceList.append(cbs)
-
+
+ '''
sourceList.append(NewsSource('The Blaze',
'http://theblaze.com',
['<a class="gallery-link" href="'],
@@ -33,7 +37,7 @@ def run():
'<!-- home -->', '<!-- loop-home -->',
'<!-- home -->', '<!-- loop-home -->',
None, None))
-
+ '''
wkl=buildWeeklyStandard()
sourceList.append(wkl)
diff --git a/parser.py b/parser.py
index 1a306cf..b9a05b9 100644
--- a/parser.py
+++ b/parser.py
@@ -3,6 +3,7 @@
from unbiasedObjects import *
from unbiasedFunctions import buildArticle
import os
+import re
'''
@@ -164,6 +165,75 @@ def removeBadStories(source, badDescArr, badAuthorArr, badImgArr):
return source
+def blazeFixDesc(articleArr):
+    '''
+    Function to fix the oddly short og:descriptions provided
+    in The Blaze articles by grabbing the first portion of the story instead.
+
+    For every article in articleArr the article page is downloaded with
+    urlToContent(article.url); the text that follows the first <p> inside
+    the 'entry-content article-styles' div is stripped of HTML tags,
+    newlines are turned into spaces, and the first 144 characters are
+    stored in article.description.
+
+    Articles whose page does not contain the expected markers are left
+    with their existing description instead of raising IndexError.
+
+    articleArr: list of article objects exposing .url and .description
+    returns:    the same list, with descriptions updated in place
+    '''
+    TAG_RE = re.compile(r'<[^>]+>')
+    for article in articleArr:
+        content = urlToContent(article.url)
+
+        # partition() reports a missing marker through an empty separator,
+        # where split(...)[1] would raise IndexError
+        _, found, body = content.partition('<div class="entry-content article-styles">')
+        if found:
+            _, found, body = body.partition('<p>')
+        if not found:
+            # unexpected page layout: keep the description we already have
+            continue
+
+        desc = TAG_RE.sub('', body)
+        desc = desc.replace('\n', ' ')
+        desc = desc[:144]
+        print(desc+'\n\n')
+        article.description = desc
+
+    return articleArr
+
+
+
+def buildBlaze():
+    '''
+    Build the news source object for The Blaze from its homepage.
+
+    Downloads http://theblaze.com, collects the main headline link, the
+    secondary headline links from the gallery between the '<!-- home -->'
+    and '<!-- loop-home -->' markers, and up to five tertiary links from
+    the feed after '<!-- loop-home -->'. The link lists are passed through
+    removeDuplicates and buildNewsSource2, and every article description
+    is then replaced via blazeFixDesc.
+
+    returns: the object produced by buildNewsSource2
+    raises:  IndexError if the homepage lacks the home markers or the
+             main 'gallery-link' anchor
+    '''
+    url = 'http://theblaze.com'
+    name = 'The Blaze'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content = urlToContent(url)
+
+    #the main and secondary headlines share the same page section,
+    #so slice it out once
+    home = content.split('<!-- home -->', 1)[1]
+    home = home.split('<!-- loop-home -->', 1)[0]
+
+    #get main headline
+    h1 = home.split('<a class="gallery-link" href="', 1)[1]
+    h1 = h1.split('"', 1)[0]
+    h1s = [url+h1]
+
+    #GET SECONDARY HEADLINES
+    figureBreak = '</figure>\n\n<figure class="gallery-item">'
+    h2 = home
+    h2s = []
+    while figureBreak in h2:
+        h2 = h2.split(figureBreak, 1)[1]
+        h2 = h2.split('href="', 1)[1]
+        x = h2.split('"', 1)[0]
+        #skip gallery entries that point at the main headline
+        if h1 not in x:
+            h2s.append(url+x)
+
+    #GET TERTIARY HEADLINES
+    feedLink = '<a class="feed-link" href="'
+    h3 = content.split('<!-- loop-home -->', 1)[1]
+    h3s = []
+    #this story section goes on forever; just grab the first 5.
+    #Also stop once no feed links are left, so a short feed does not
+    #raise IndexError on the split below
+    while len(h3s) < 5 and feedLink in h3:
+        h3 = h3.split(feedLink, 1)[1]
+        x = h3.split('"', 1)[0]
+        if h1 not in x:
+            h3s.append(url+x)
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+    blz = buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    #The Blaze has dumb, short description fields, so we need to grab
+    #the first x characters of actual article text instead
+    blz.h1Arr = blazeFixDesc(blz.h1Arr)
+    blz.h2Arr = blazeFixDesc(blz.h2Arr)
+    blz.h3Arr = blazeFixDesc(blz.h3Arr)
+
+    return blz
+
def buildCBS():