From 511b3ba3f9de0d38e861833d6bcd7160487af111 Mon Sep 17 00:00:00 2001
From: sstvinc2
Date: Tue, 14 Feb 2017 22:05:39 -0600
Subject: Weekly Standard now uses new parser

---
 main.py   | 11 ++---------
 parser.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index e26c8c2..948bf36 100644
--- a/main.py
+++ b/main.py
@@ -58,15 +58,8 @@ def run():
 			None, None))
-	sourceList.append(NewsSource('Weekly Standard',
-		'http://www.weeklystandard.com/',
-		['<...>', 'href="'],
-		['<...>', 'href="'],
-		[],
-		'<...>', '<...>',
[...]

diff --git a/parser.py b/parser.py
--- a/parser.py
+++ b/parser.py
[...]
+def buildWeeklyStandard():
+	url='http://www.weeklystandard.com'
+	name='Weekly Standard'
+
+	#DOWNLOAD HOMEPAGE CONTENT
+	content=urlToContent(url)
+
+	#GET MAIN HEADLINE
+	h1=content
+	h1=h1.split('<...>', 1)[1]
+	h1=h1.split('href="', 1)[1]
+	h1=h1.split('"', 1)[0]
+	h1s=[h1]
+
+	#GET SECONDARY HEADLINES
+	h2=content
+	h2s=[]
+	h2=h2.split('<...>', 1)[1]
+	h2=h2.split('<...>', 1)[0]
+	while '<...>' in h2:
+		h2=h2.split('<...>', 1)[1]
+		h2=h2.split('href="', 1)[1]
+		x=h2.split('"', 1)[0]
+		if h1 not in x:
+			h2s.append(x)
+
+	#GET TERTIARY HEADLINES
+	h3=content
+	h3s=[]
+	h3=h3.split('Today\'s Standard', 1)[1]
+	h3=h3.split('<...>', 1)[0]
+	while '<...>' in h3:
+		h3=h3.split('<...>', 1)[1]
+		h3=h3.split('href="', 1)[1]
+		x=h3.split('"', 1)[0]
+		if h1 not in x:
+			h3s.append(x)
+
+	#Need to add URL prefix to all URLs
+	for i in range(len(h1s)):
+		h1s[i]=url+h1s[i]
+	for i in range(len(h2s)):
+		h2s[i]=url+h2s[i]
+	for i in range(len(h3s)):
+		h3s[i]=url+h3s[i]
+
+
+	h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+	wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+	#REMOVE BAD STORIES
+	#if it's in the h1 slot, bump up the first h2 into the h1 slot
+	for h1 in wkl.h1Arr:
+		if 'Matt Labash' in h1.description:
+			wkl.h1Arr.remove(h1)
+			wkl.h1Arr.append(wkl.h2Arr[0])
+			wkl.h2Arr.remove(wkl.h2Arr[0])
+			print('removed '+h1.title)
+
+	return wkl
+
+
+
 def buildFoxNews():
 	url='http://foxnews.com'
 	name='Fox News'
-- 
cgit v1.2.3
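
The parsing approach in this patch is pure string surgery: the homepage HTML is cut down to one region with str.split() on a start and end marker, then a while loop peels one href="..." value off per item tag, skipping anything that duplicates the main headline, and finally the site URL is prefixed onto the relative paths. Below is a minimal, self-contained sketch of that pattern; the sample markup, the delimiter strings, and the extractLinks helper are illustrative assumptions, not the selectors the real Weekly Standard parser keys on.

    # Minimal sketch of the split-based scraping pattern used above.
    # The sample markup, the delimiter strings, and the extractLinks helper
    # are illustrative assumptions, not the tags the real parser uses.

    def extractLinks(content, sectionStart, sectionEnd, itemTag, skip=None):
        # Cut the page down to one region of the homepage
        section = content.split(sectionStart, 1)[1]
        section = section.split(sectionEnd, 1)[0]

        links = []
        # Peel one href off per item tag until the region is exhausted
        while itemTag in section:
            section = section.split(itemTag, 1)[1]
            section = section.split('href="', 1)[1]
            x = section.split('"', 1)[0]
            # Skip whatever already went into the h1 slot
            if skip is None or skip not in x:
                links.append(x)
        return links

    if __name__ == '__main__':
        url = 'http://www.weeklystandard.com'
        sampleHtml = ('<div class="top-stories"><ul>'
                      '<li class="story"><a href="/article/one">One</a></li>'
                      '<li class="story"><a href="/article/two">Two</a></li>'
                      '</ul></div>')
        h2s = extractLinks(sampleHtml, '<div class="top-stories">', '</div>',
                           '<li class="story">', skip='/article/one')
        # Relative links get the site prefix added, as in the patch
        h2s = [url + x for x in h2s]
        print(h2s)  # ['http://www.weeklystandard.com/article/two']

Chained splits like this keep the parser free of any HTML-parsing dependency, but they are tightly coupled to each site's markup, which is presumably why every news source gets its own hand-written build function in parser.py.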