From 4622a264b8d6e0446a52d96b7df220d357c082a9 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sun, 16 Apr 2017 23:21:22 -0400 Subject: move files around for packaging reasons --- .gitignore | 4 +- html_template/newtemplate.html | 150 ----- html_template/template.html | 173 ------ html_template/unbiased.css | 220 ------- main.py | 70 --- parser.py | 986 -------------------------------- scratch/do_not_delete | 0 spotCheck.py | 41 -- unbiased/html_template/newtemplate.html | 150 +++++ unbiased/html_template/template.html | 173 ++++++ unbiased/html_template/unbiased.css | 220 +++++++ unbiased/main.py | 70 +++ unbiased/parser.py | 986 ++++++++++++++++++++++++++++++++ unbiased/scratch/do_not_delete | 0 unbiased/spotCheck.py | 41 ++ unbiased/unbiasedFunctions.py | 259 +++++++++ unbiased/unbiasedObjects.py | 90 +++ unbiasedFunctions.py | 259 --------- unbiasedObjects.py | 90 --- 19 files changed, 1991 insertions(+), 1991 deletions(-) delete mode 100644 html_template/newtemplate.html delete mode 100755 html_template/template.html delete mode 100755 html_template/unbiased.css delete mode 100755 main.py delete mode 100755 parser.py delete mode 100644 scratch/do_not_delete delete mode 100755 spotCheck.py create mode 100644 unbiased/html_template/newtemplate.html create mode 100755 unbiased/html_template/template.html create mode 100755 unbiased/html_template/unbiased.css create mode 100755 unbiased/main.py create mode 100755 unbiased/parser.py create mode 100644 unbiased/scratch/do_not_delete create mode 100755 unbiased/spotCheck.py create mode 100644 unbiased/unbiasedFunctions.py create mode 100644 unbiased/unbiasedObjects.py delete mode 100644 unbiasedFunctions.py delete mode 100644 unbiasedObjects.py diff --git a/.gitignore b/.gitignore index 65c8f8e..90bf98d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,10 @@ *.pyc *~ __pycache__/ -scratch/*.html +unbiased/scratch/*.html legacy_py/ unbiased.html html_template/Penguins.jpg html_template/BAK* #* -.#* \ No newline at end of file +.#* diff --git a/html_template/newtemplate.html b/html_template/newtemplate.html deleted file mode 100644 index 0cec766..0000000 --- a/html_template/newtemplate.html +++ /dev/null @@ -1,150 +0,0 @@ - - - - - - UnBiased - - - - - -
-
- -
- -
-
-
Rand Paul and Cory Booker push bipartisan effort to limit solitary confinement for juveniles
-
-
Sen. Rand Paul (R-Ky) and Sen …
-
- -
- -
-
-
Bibi and Donald
-
-
This week, Israel's prime minister will visit Washington and meet with our new president. They will have a complex agenda. Benjamin ...
-
- -
- -
-
-
David Oyelowo on How to Play a Real King
-
-
He stars in “A United Kingdom,” about the Botswana leader who married a white woman and set off an international crisis.
-
- -
- -
-
-
Judge orders Ohio village to pay back $3 million to lead-footed drivers
-
-
Speed cameras became a cash cow for the small village of New Miami, Ohio.
-
- -
- - - -
-
- xxTitle3-1xx -
- -
- xxTitle3-2xx -
- -
- xxTitle3-3xx -
- -
- xxTitle3-4xx -
- -
- xxTitle3-5xx -
- -
- xxTitle3-6xx -
- -
- xxTitle3-7xx -
- -
- xxTitle3-8xx -
-
- -
- -
- Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News -
- - diff --git a/html_template/template.html b/html_template/template.html deleted file mode 100755 index fc17006..0000000 --- a/html_template/template.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - UnBiased - - - - - -
-
-
- -
- -
-
xxTitle1-1xx
-
-
xxDesc1-1xx
-
- -
- -
-
-
xxTitle1-2xx
-
-
xxDesc1-2xx
-
- -
- -
- -
- -
-
-
xxTitle1-3xx
-
-
xxDesc1-3xx
-
- -
- -
-
-
xxTitle1-4xx
-
-
xxDesc1-4xx
-
- -
- -
- - - -
-
- xxTitle3-1xx -
- -
- xxTitle3-2xx -
- -
- xxTitle3-3xx -
- -
- xxTitle3-4xx -
- -
- xxTitle3-5xx -
- -
- xxTitle3-6xx -
- -
- xxTitle3-7xx -
- -
- xxTitle3-8xx -
- -
- xxTitle3-9xx -
- -
- xxTitle3-10xx -
- -
- xxTitle3-11xx -
- -
- xxTitle3-12xx -
-
- -
- -
- Sources: xxSourcesxx -
- - diff --git a/html_template/unbiased.css b/html_template/unbiased.css deleted file mode 100755 index 244f100..0000000 --- a/html_template/unbiased.css +++ /dev/null @@ -1,220 +0,0 @@ -/*body{ - width:900px; - margin-left:auto; - margin-right:auto; -}*/ - - -body{ - margin:0; -} - -a:link, a:visited, a:hover, a:active { - color: #00f; - text-decoration:none; - } - -a:hover{ - cursor:pointer; -} - -#page-header{ - width:100%; - text-align:center; - padding:.5em 0 1em; - margin-bottom:1em; - border-bottom:3px solid #BB133E; - background:#002147; -} - -.title{ - font-size:3em; -} - -#title-1{ - font-style:italic; - color:#fff; -} - -#title-2{ - color:#fff; -} - -#subtitle{ - font-size:1.25em; - color:#ccc; -} - -#timestamp{ - margin:.5em 0 0 0; - font-size:.8em; - color:#cc6; -} - -#page-container{ - width:900px; - margin-left:auto; - margin-right:auto; -} - -@media only screen and (max-width:900px){ - #page-container{ - width:100% - } -} - -#top-stories{ - width:95%; - display:block; - overflow:auto; - padding:10px; - margin-left:auto; - margin-right:auto; - text-align:center; - border-bottom: 3px solid #BB133E; - margin-bottom: 10px; -} - -.row{ - display:flex; -} - -.top-story{ - display:inline-block; - vertical-align:top; - text-align:left; - width:360px; - height:auto; - overflow:hidden; - background:#fff; - margin:10px; - padding:10px; - border:2px solid #ccc; - flex:1; -} - -@media only screen and (max-width:500px){ - .row{ - display:block; - } - .top-story{ - display:block; - width:auto; - height:auto; - } -} - -.top-stories-img{ - width:350px; - height:200px; - overflow:hidden; - background-size: auto 234px;/*cover;*/ - background-position: top center;/*center center;*/ - margin:0 auto; -} - -@media only screen and (max-width:500px){ - .top-stories-img{ - width:auto; - } -} - - -.top-stories-hed{ - font-weight:bold; - font-size:1.35em; - margin:10px 10px 0; - color:#00f; -} - -.top-stories-desc{ - font-size:1em; - padding-top:.5em; - margin:0 .75em; -} - -#middle-stories{ - clear:both; - width:500px; - margin:0 auto; - padding:0; - display:block; - overflow:auto; - float:left; -} - -@media only screen and (max-width:500px){ - #middle-stories{ - width:100%; - float:none; - } -} - -.middle-story{ - margin:5px 10px; - padding:10px; - background:#fff; - border:2px solid #ddd; - width:460px; - float:left; -} - -@media only screen and (max-width:500px){ - .middle-story{ - width:auto; - } -} - -.middle-stories-img{ - width:150px; - height:100px; - overflow:hidden; - background-size: auto 117px;/*cover;*/ - background-position: top center;/*center center;*/ - float:left; - max-width:35%; -} - -.middle-stories-hed{ - font-size:1.2em; - float:left; - width:300px; - margin-left:10px; - color:#00f; -} - -@media only screen and (max-width:500px){ - .middle-stories-hed{ - max-width:60%; - } -} - -#bottom-stories{ - margin:0 10px; - padding:10px; - display:block; - overflow:auto; - float:left; - width:350px; - border:5px solid #ddd; -} - -@media only screen and (max-width:900px){ - #bottom-stories{ - width:auto; - border-width:3px; - float:none; - } -} - -.bottom-story{ color:#00f; - - padding:15px 0; - color:#00f; -} - -#sources{ - clear:both; - padding-top:4em; - font-size:.8em; -} \ No newline at end of file diff --git a/main.py b/main.py deleted file mode 100755 index f1c3317..0000000 --- a/main.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os - -from unbiasedObjects import * -from unbiasedFunctions import * -from parser import * -import time - - -def 
main(): - parser = argparse.ArgumentParser() - parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') - args = parser.parse_args() - - while True: - print('-----------------------') - run(args.webroot) - print('-----------------------') - time.sleep(600) - -def run(webroot): - sourceList=[] - - ''' - - SOURCES TO ADD NEXT: - -ABC - -REUTERS - -Town Hall - - ''' - - print('running with webroot="{}"'.format(webroot)) - - - ### These values have to be the second half of the function name - ### E.g. Guardian calls buildGuardian(), etc. - sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', - 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' - - for source in sourceFnArr: - tries=0 - while tries<3: - try: - fn='build'+source - possibles = globals().copy() - possibles.update(locals()) - method = possibles.get(fn) - src=method() - sourceList.append(src) - break - except: - print('Build error. Looping again: '+source) - tries+=1 - time.sleep(tries) - - #scrape all urls and build data structure - newsSourceArr=buildNewsSourceArr(sourceList) - - #build the output file HTML - outputHTML=buildOutput(newsSourceArr) - - #print the output file HTML - printOutputHTML(outputHTML, os.path.join(webroot, 'index.html')) - - -if __name__=="__main__": - main() diff --git a/parser.py b/parser.py deleted file mode 100755 index f69281b..0000000 --- a/parser.py +++ /dev/null @@ -1,986 +0,0 @@ -#!/usr/bin/env python3 - -from unbiasedObjects import * -from unbiasedFunctions import buildArticle -import os -import re - - -''' -Takes in a URL, downloads the file to a temp file, -reads the file into a string, and returns that string -''' -def urlToContent(url, sourceEncoding='utf8'): - #download file - os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) - - #read file - if sourceEncoding=='utf8': - f=open('scratch/temp1.html', 'r', encoding="utf8") - else: - f=open('scratch/temp1.html', 'r', encoding="latin-1") - content=f.read() - f.close() - - return content - - -''' -Creates a new newsSource2 object. For each URL in h1-h3URLs, -calls the file scraper and appends the new Article object. -Returns a newsSource2 object -''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): - h1Arr=[] - a=buildArticle(h1URLs[0], name) - if a==None: - print('................\nH1 Nonetype in '+name+'\n................') - else: - h1Arr.append(a) - - h2Arr=[] - for x in h2URLs: - a=buildArticle(x, name) - if a!=None: - h2Arr.append(a) - else: - print('................\nH2 Nonetype in '+name+'\n................') - - - h3Arr=[] - for x in h3URLs: - a=buildArticle(x, name) - if a!=None: - h3Arr.append(a) - else: - print('................\nH3 Nonetype in '+name+'\n................') - - #BUILD THE NEWS SOURCE - newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) - - return newsSource - - -''' -Some sites will replicate URLs across the page. This function removes them. -Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; -if h2 exists in h1s, remove from h2s - -also check partial URLs (e.g. 
nytimes.com/story.html is the same as -nytimes.com/story.html?var=x -''' -def removeDuplicates(h1s, h2s, h3s): - #Assume h1s is one element, and keep it - - #remove h2 duplicates - removeArr=[] - for i in range(len(h2s)): - #check internally - for j in range(len(h2s)): - if i==j: - continue - else: - if h2s[i] in h2s[j]: - removeArr.append(h2s[j]) - #check against h1s - for k in range(len(h1s)): - if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): - removeArr.append(h2s[i]) - for x in removeArr: - h2s.remove(x) - - #remove h3 duplicates - removeArr=[] - for i in range(len(h3s)): - #check internally - for j in range(len(h3s)): - if i==j: - continue - else: - if h3s[i] in h3s[j]: - removeArr.append(h3s[j]) - #check against h1s and h2s - h1and2=h1s+h2s - for k in range(len(h1and2)): - if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): - removeArr.append(h3s[i]) - for x in removeArr: - if x in h3s: - h3s.remove(x) - - - return h1s, h2s, h3s - - - -def removalNotification(source, title, reason, value): - print('*************************') - print('\t\tSTORY REMOVED') - print('SOURCE: '+source) - print('TITLE: \t'+title) - print('REASON: '+reason) - print('VALUE: \t'+value) - print('*************************\n\n') - - -def removeBadStoriesHelper(source, element, badStringList, arr): - if badStringList!=None: - for i in range(len(arr)): - for hed in arr[i]: - if hed==None: - print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////") - break - for item in badStringList: - if item in getattr(hed, element): - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, element, item) - - -def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): - - arr=[source.h1Arr, source.h2Arr, source.h3Arr] - - removeBadStoriesHelper(source, "title", badTitleArr, arr) - removeBadStoriesHelper(source, "description", badDescArr, arr) - removeBadStoriesHelper(source, "author", badAuthorArr, arr) - removeBadStoriesHelper(source, "img", badImgArr, arr) - removeBadStoriesHelper(source, "url", badURLArr, arr) - - return source - - - - -def buildTheHill(): - url='http://thehill.com' - name='The Hill' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('
', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
', 1)[1] - h3=h3.split('', 1)[0] - while '
')[2:] - for x in h2: - if '

', 1)[1] - h3=h3.split('
', 1)[0]#'', 1)[0] - while '
  • ' in h2: - h2=h2.split('
  • ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[1] - h3=h3.split('Watch/Listen', 1)[0] - while '
    ', 1)[1] - h1=h1.split('href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('
    ', 1)[1] - h2=h2.split('
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('Today\'s Standard', 1)[1] - h3=h3.split('
    ' in h3: - h3=h3.split('
    ', 1)[1] - h3=h3.split('href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - #Need to add URL prefix to all URLs - for i in range(len(h1s)): - h1s[i]=url+h1s[i] - for i in range(len(h2s)): - h2s[i]=url+h2s[i] - for i in range(len(h3s)): - h3s[i]=url+h3s[i] - - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=None - ## if flagged again, remove Micah Mattix - badDescArr=['Matt Labash'] - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] - badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] - wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) - - return wkl - - - - -def buildNPR(): - url='http://www.npr.org/sections/news/' - name='NPR' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[0] - while '', 1)[1] - h1=h1.split('
    ', 1)[1] - h2=h2.split('', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h2.split('
    ', 1) - h2=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '', 1)[1] - h3=h3.split('', 1)[0] - #remove "collection" sets - while '
    ' in h3: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '' in h3: - h3=h3.split('', 1)[1] - h3=h3.split('', 1)[0] - elif '/video/the-daily-360' in h3: - h3=h3.split('/video/the-daily-360')[-1] - h3=h3.split('More News', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - - #Grab the remaining URLs - while ' - -
    - -
    -

    Top News

    - -
    - - -''' diff --git a/scratch/do_not_delete b/scratch/do_not_delete deleted file mode 100644 index e69de29..0000000 diff --git a/spotCheck.py b/spotCheck.py deleted file mode 100755 index d1edda4..0000000 --- a/spotCheck.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - - -from parser import * -from unbiasedObjects import * -import sys - -def spotCheck(src): - - fns = {'hil' : buildTheHill, - 'cbs' : buildCBS, - 'npr' : buildNPR, - 'fox' : buildFoxNews, - 'gdn' : buildGuardian, - 'blz' : buildBlaze, - 'bbc' : buildBBC, - 'nbc' : buildNBC, - 'wat' : buildWashTimes, - 'csm' : buildCSM, - 'abc' : buildABC} - - data=fns[src]() - - print('H1s:\n--------------') - for h in data.h1Arr: - print(h.title) - - print('\n\nH2s:\n--------------') - for h in data.h2Arr: - print(h.title) - - print('\n\nH3s:\n--------------') - for h in data.h3Arr: - print(h.title) - - print('\n\n') - - - -if __name__=='__main__': - spotCheck(sys.argv[1]) diff --git a/unbiased/html_template/newtemplate.html b/unbiased/html_template/newtemplate.html new file mode 100644 index 0000000..0cec766 --- /dev/null +++ b/unbiased/html_template/newtemplate.html @@ -0,0 +1,150 @@ + + + + + + UnBiased + + + + + + + +
    + Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News +
    + + diff --git a/unbiased/html_template/template.html b/unbiased/html_template/template.html new file mode 100755 index 0000000..fc17006 --- /dev/null +++ b/unbiased/html_template/template.html @@ -0,0 +1,173 @@ + + + + + + + UnBiased + + + + + + + +
    + Sources: xxSourcesxx +
    + + diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css new file mode 100755 index 0000000..244f100 --- /dev/null +++ b/unbiased/html_template/unbiased.css @@ -0,0 +1,220 @@ +/*body{ + width:900px; + margin-left:auto; + margin-right:auto; +}*/ + + +body{ + margin:0; +} + +a:link, a:visited, a:hover, a:active { + color: #00f; + text-decoration:none; + } + +a:hover{ + cursor:pointer; +} + +#page-header{ + width:100%; + text-align:center; + padding:.5em 0 1em; + margin-bottom:1em; + border-bottom:3px solid #BB133E; + background:#002147; +} + +.title{ + font-size:3em; +} + +#title-1{ + font-style:italic; + color:#fff; +} + +#title-2{ + color:#fff; +} + +#subtitle{ + font-size:1.25em; + color:#ccc; +} + +#timestamp{ + margin:.5em 0 0 0; + font-size:.8em; + color:#cc6; +} + +#page-container{ + width:900px; + margin-left:auto; + margin-right:auto; +} + +@media only screen and (max-width:900px){ + #page-container{ + width:100% + } +} + +#top-stories{ + width:95%; + display:block; + overflow:auto; + padding:10px; + margin-left:auto; + margin-right:auto; + text-align:center; + border-bottom: 3px solid #BB133E; + margin-bottom: 10px; +} + +.row{ + display:flex; +} + +.top-story{ + display:inline-block; + vertical-align:top; + text-align:left; + width:360px; + height:auto; + overflow:hidden; + background:#fff; + margin:10px; + padding:10px; + border:2px solid #ccc; + flex:1; +} + +@media only screen and (max-width:500px){ + .row{ + display:block; + } + .top-story{ + display:block; + width:auto; + height:auto; + } +} + +.top-stories-img{ + width:350px; + height:200px; + overflow:hidden; + background-size: auto 234px;/*cover;*/ + background-position: top center;/*center center;*/ + margin:0 auto; +} + +@media only screen and (max-width:500px){ + .top-stories-img{ + width:auto; + } +} + + +.top-stories-hed{ + font-weight:bold; + font-size:1.35em; + margin:10px 10px 0; + color:#00f; +} + +.top-stories-desc{ + font-size:1em; + padding-top:.5em; + margin:0 .75em; +} + +#middle-stories{ + clear:both; + width:500px; + margin:0 auto; + padding:0; + display:block; + overflow:auto; + float:left; +} + +@media only screen and (max-width:500px){ + #middle-stories{ + width:100%; + float:none; + } +} + +.middle-story{ + margin:5px 10px; + padding:10px; + background:#fff; + border:2px solid #ddd; + width:460px; + float:left; +} + +@media only screen and (max-width:500px){ + .middle-story{ + width:auto; + } +} + +.middle-stories-img{ + width:150px; + height:100px; + overflow:hidden; + background-size: auto 117px;/*cover;*/ + background-position: top center;/*center center;*/ + float:left; + max-width:35%; +} + +.middle-stories-hed{ + font-size:1.2em; + float:left; + width:300px; + margin-left:10px; + color:#00f; +} + +@media only screen and (max-width:500px){ + .middle-stories-hed{ + max-width:60%; + } +} + +#bottom-stories{ + margin:0 10px; + padding:10px; + display:block; + overflow:auto; + float:left; + width:350px; + border:5px solid #ddd; +} + +@media only screen and (max-width:900px){ + #bottom-stories{ + width:auto; + border-width:3px; + float:none; + } +} + +.bottom-story{ color:#00f; + + padding:15px 0; + color:#00f; +} + +#sources{ + clear:both; + padding-top:4em; + font-size:.8em; +} \ No newline at end of file diff --git a/unbiased/main.py b/unbiased/main.py new file mode 100755 index 0000000..f1c3317 --- /dev/null +++ b/unbiased/main.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import argparse +import os + +from unbiasedObjects import * +from unbiasedFunctions 
import * +from parser import * +import time + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') + args = parser.parse_args() + + while True: + print('-----------------------') + run(args.webroot) + print('-----------------------') + time.sleep(600) + +def run(webroot): + sourceList=[] + + ''' + + SOURCES TO ADD NEXT: + -ABC + -REUTERS + -Town Hall + + ''' + + print('running with webroot="{}"'.format(webroot)) + + + ### These values have to be the second half of the function name + ### E.g. Guardian calls buildGuardian(), etc. + sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', + 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' + + for source in sourceFnArr: + tries=0 + while tries<3: + try: + fn='build'+source + possibles = globals().copy() + possibles.update(locals()) + method = possibles.get(fn) + src=method() + sourceList.append(src) + break + except: + print('Build error. Looping again: '+source) + tries+=1 + time.sleep(tries) + + #scrape all urls and build data structure + newsSourceArr=buildNewsSourceArr(sourceList) + + #build the output file HTML + outputHTML=buildOutput(newsSourceArr) + + #print the output file HTML + printOutputHTML(outputHTML, os.path.join(webroot, 'index.html')) + + +if __name__=="__main__": + main() diff --git a/unbiased/parser.py b/unbiased/parser.py new file mode 100755 index 0000000..f69281b --- /dev/null +++ b/unbiased/parser.py @@ -0,0 +1,986 @@ +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os +import re + + +''' +Takes in a URL, downloads the file to a temp file, +reads the file into a string, and returns that string +''' +def urlToContent(url, sourceEncoding='utf8'): + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + if sourceEncoding=='utf8': + f=open('scratch/temp1.html', 'r', encoding="utf8") + else: + f=open('scratch/temp1.html', 'r', encoding="latin-1") + content=f.read() + f.close() + + return content + + +''' +Creates a new newsSource2 object. For each URL in h1-h3URLs, +calls the file scraper and appends the new Article object. +Returns a newsSource2 object +''' +def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): + h1Arr=[] + a=buildArticle(h1URLs[0], name) + if a==None: + print('................\nH1 Nonetype in '+name+'\n................') + else: + h1Arr.append(a) + + h2Arr=[] + for x in h2URLs: + a=buildArticle(x, name) + if a!=None: + h2Arr.append(a) + else: + print('................\nH2 Nonetype in '+name+'\n................') + + + h3Arr=[] + for x in h3URLs: + a=buildArticle(x, name) + if a!=None: + h3Arr.append(a) + else: + print('................\nH3 Nonetype in '+name+'\n................') + + #BUILD THE NEWS SOURCE + newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) + + return newsSource + + +''' +Some sites will replicate URLs across the page. This function removes them. +Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; +if h2 exists in h1s, remove from h2s + +also check partial URLs (e.g. 
nytimes.com/story.html is the same as +nytimes.com/story.html?var=x +''' +def removeDuplicates(h1s, h2s, h3s): + #Assume h1s is one element, and keep it + + #remove h2 duplicates + removeArr=[] + for i in range(len(h2s)): + #check internally + for j in range(len(h2s)): + if i==j: + continue + else: + if h2s[i] in h2s[j]: + removeArr.append(h2s[j]) + #check against h1s + for k in range(len(h1s)): + if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): + removeArr.append(h2s[i]) + for x in removeArr: + h2s.remove(x) + + #remove h3 duplicates + removeArr=[] + for i in range(len(h3s)): + #check internally + for j in range(len(h3s)): + if i==j: + continue + else: + if h3s[i] in h3s[j]: + removeArr.append(h3s[j]) + #check against h1s and h2s + h1and2=h1s+h2s + for k in range(len(h1and2)): + if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): + removeArr.append(h3s[i]) + for x in removeArr: + if x in h3s: + h3s.remove(x) + + + return h1s, h2s, h3s + + + +def removalNotification(source, title, reason, value): + print('*************************') + print('\t\tSTORY REMOVED') + print('SOURCE: '+source) + print('TITLE: \t'+title) + print('REASON: '+reason) + print('VALUE: \t'+value) + print('*************************\n\n') + + +def removeBadStoriesHelper(source, element, badStringList, arr): + if badStringList!=None: + for i in range(len(arr)): + for hed in arr[i]: + if hed==None: + print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////") + break + for item in badStringList: + if item in getattr(hed, element): + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, element, item) + + +def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): + + arr=[source.h1Arr, source.h2Arr, source.h3Arr] + + removeBadStoriesHelper(source, "title", badTitleArr, arr) + removeBadStoriesHelper(source, "description", badDescArr, arr) + removeBadStoriesHelper(source, "author", badAuthorArr, arr) + removeBadStoriesHelper(source, "img", badImgArr, arr) + removeBadStoriesHelper(source, "url", badURLArr, arr) + + return source + + + + +def buildTheHill(): + url='http://thehill.com' + name='The Hill' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('
    ', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
    ', 1)[1] + h3=h3.split('', 1)[0] + while '
    ')[2:] + for x in h2: + if '

    ', 1)[1] + h3=h3.split('
    ', 1)[0]#'', 1)[0] + while '
  • ' in h2: + h2=h2.split('
  • ', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
    ', 1)[1] + h3=h3.split('Watch/Listen', 1)[0] + while '
    ', 1)[1] + h1=h1.split('href="', 1)[1] + h1=h1.split('"', 1)[0] + h1s=[h1] + + #GET SECONDARY HEADLINES + h2=content + h2s=[] + h2=h2.split('
    ', 1)[1] + h2=h2.split('
    ' in h2: + h2=h2.split('
    ', 1)[1] + h2=h2.split('href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('Today\'s Standard', 1)[1] + h3=h3.split('
    ' in h3: + h3=h3.split('
    ', 1)[1] + h3=h3.split('href="', 1)[1] + x=h3.split('"', 1)[0] + if h1 not in x: + h3s.append(x) + + #Need to add URL prefix to all URLs + for i in range(len(h1s)): + h1s[i]=url+h1s[i] + for i in range(len(h2s)): + h2s[i]=url+h2s[i] + for i in range(len(h3s)): + h3s[i]=url+h3s[i] + + + h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) + wkl=buildNewsSource2(name, url, h1s, h2s, h3s) + + #REMOVE BAD STORIES + badTitleArr=None + ## if flagged again, remove Micah Mattix + badDescArr=['Matt Labash'] + badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] + badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] + wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) + + return wkl + + + + +def buildNPR(): + url='http://www.npr.org/sections/news/' + name='NPR' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
    ' in h2: + h2=h2.split('
    ', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
    ', 1)[0] + while '', 1)[1] + h1=h1.split('
    ', 1)[1] + h2=h2.split('', 1)[0] + #remove "collection" sets + while '
    ' in h2: + arr=h2.split('
    ', 1) + h2=arr[0]+arr[1].split('', 1)[1] + #Grab the remaining URLs + while '', 1)[1] + h3=h3.split('', 1)[0] + #remove "collection" sets + while '
    ' in h3: + arr=h3.split('
    ', 1) + h3=arr[0]+arr[1].split('', 1)[1] + #Grab the remaining URLs + while '' in h3: + h3=h3.split('', 1)[1] + h3=h3.split('', 1)[0] + elif '/video/the-daily-360' in h3: + h3=h3.split('/video/the-daily-360')[-1] + h3=h3.split('More News', 1)[0] + #remove "collection" sets + while '
    ' in h2: + arr=h3.split('
    ', 1) + h3=arr[0]+arr[1].split('', 1)[1] + + #Grab the remaining URLs + while ' + +
    + +
    +

    Top News

    + +
    + + +''' diff --git a/unbiased/scratch/do_not_delete b/unbiased/scratch/do_not_delete new file mode 100644 index 0000000..e69de29 diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py new file mode 100755 index 0000000..d1edda4 --- /dev/null +++ b/unbiased/spotCheck.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + + +from parser import * +from unbiasedObjects import * +import sys + +def spotCheck(src): + + fns = {'hil' : buildTheHill, + 'cbs' : buildCBS, + 'npr' : buildNPR, + 'fox' : buildFoxNews, + 'gdn' : buildGuardian, + 'blz' : buildBlaze, + 'bbc' : buildBBC, + 'nbc' : buildNBC, + 'wat' : buildWashTimes, + 'csm' : buildCSM, + 'abc' : buildABC} + + data=fns[src]() + + print('H1s:\n--------------') + for h in data.h1Arr: + print(h.title) + + print('\n\nH2s:\n--------------') + for h in data.h2Arr: + print(h.title) + + print('\n\nH3s:\n--------------') + for h in data.h3Arr: + print(h.title) + + print('\n\n') + + + +if __name__=='__main__': + spotCheck(sys.argv[1]) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py new file mode 100644 index 0000000..1a80d7a --- /dev/null +++ b/unbiased/unbiasedFunctions.py @@ -0,0 +1,259 @@ +from unbiasedObjects import * +import os +import random +import time +import re + + +#take in a url and delimiters, return twitter card +def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): + + debugging=False + if debugging: + print(sourceName) + print(url) + print() + + #download url + os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) + + #read the file in + f=open('scratch/temp_article.html', 'r', encoding="utf8") + content=f.read() + f.close() + + try: + if sourceName=='The Guardian': + #The Guardian puts an identifying banner on their og:images + #grab the main image from the page instead + + #scenario 1: regular image + if '')[0] + elif sourceName=='ABC News': + img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' + if img[-1]=='/': + #because the quote separator could be ' or ", + #trim to just before it then lop it off + img=img[:-1].strip() + img=img[:-1] + + if debugging: + print(img) + + title=content.split('og:title" content=')[1][1:].split('>')[0] + if title[-1]=='/': + title=title[:-1].strip() + title=title[:-1] + + if debugging: + print(title) + + + author='' + if sourceName=='The Blaze': + if 'class="article-author">' in content: + author=content.split('class="article-author">')[1].split('<')[0] + elif 'class="article-author" href="' in content: + author=content.split('class="article-author" href="')[1] + author=author.split('>')[1].split('<')[0].strip() + else: + authorTags=['article:author', 'dc.creator', 'property="author'] + for tag in authorTags: + if tag in content: + author=content.split(tag+'" content=')[1][1:].split('>')[0] + author=author[:-1] + #trim an extra quotation mark for The Hill + if sourceName=='The Hill': + author=author.split('"', 1)[0] + break + + if debugging: + print(author) + + + if 'og:description' in content: + description=content.split('og:description" content=')[1][1:].split('>')[0] + if description[-1]=='/': + description=description[:-1].strip() + description=description[:-1] + else: + if sourceName=='The Hill': + description=content.split('div class="field-items"')[-1] + description=re.sub('<[^<]+?>', '', description) + description=description[1:200] + else: + print("SHOULDN'T GET HERE") + + #strip out self-references + description=description.replace(sourceName+"'s", '***') + 
description=description.replace(sourceName+"'", '***') + description=description.replace(sourceName, '***') + + if debugging: + print(description) + + + a=Article(title, url, img, description, sourceName, author) + return a + + except: + print('^^^^^^^^^^^^^^^^^^^^^^^^^') + print('\tARTICLE PARSING ERROR') + print('SOURCE: '+sourceName) + print('URL: \t'+url) + print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n') + return None + + +def buildOutput(newsSourceArr): + #read in the template html file + f=open('html_template/template.html', 'r') + template=f.read() + f.close() + + #set the random order for sources + h1RandomSources=[] + while len(h1RandomSources)<4: + x=random.sample(range(len(newsSourceArr)), 1)[0] + if len(newsSourceArr[x].h1Arr)>0: + if x not in h1RandomSources: + h1RandomSources.append(x) + else: + print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + + #For h2s and h3s, select N random sources (can repeat), then + #a non-repetitive random article from within + h2RandomPairs=[] + while len(h2RandomPairs) < 6: + x=random.sample(range(len(newsSourceArr)), 1)[0] + if len(newsSourceArr[x].h2Arr) > 0: + y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0] + pair=[x,y] + if not pair in h2RandomPairs: + h2RandomPairs.append(pair) + else: + print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + + h3RandomPairs=[] + while len(h3RandomPairs) < 12: + x=random.sample(range(len(newsSourceArr)), 1)[0] + print(newsSourceArr[x].name) + if len(newsSourceArr[x].h3Arr) > 0: + y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0] + pair=[x,y] + if not pair in h3RandomPairs: + h3RandomPairs.append(pair) + else: + print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + + #replace html template locations with data from newsSourceArr + for i in range(len(h1RandomSources)): + source=newsSourceArr[h1RandomSources[i]] + randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] + article=source.h1Arr[randomArticle] + template=template.replace('xxURL1-'+str(i+1)+'xx', article.url) + template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title) + template=template.replace('xxImg1-'+str(i+1)+'xx', article.img) + desc=article.description + if len(desc)>144: + desc=desc[:141] + desc=desc.split()[:-1] + desc=' '.join(desc)+' ...' 
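+        # truncation above: descriptions over 144 chars are cut at 141, the
+        # trailing (possibly partial) word is dropped, and ' ...' is appended,
+        # so the teaser stays word-aligned before it is filled into the template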
+ template=template.replace('xxDesc1-'+str(i+1)+'xx', desc) + + for i in range(len(h2RandomPairs)): + pair=h2RandomPairs[i] + article=newsSourceArr[pair[0]].h2Arr[pair[1]] + template=template.replace('xxURL2-'+str(i+1)+'xx', article.url) + template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title) + template=template.replace('xxImg2-'+str(i+1)+'xx', article.img) + + for i in range(len(h3RandomPairs)): + pair=h3RandomPairs[i] + article=newsSourceArr[pair[0]].h3Arr[pair[1]] + template=template.replace('xxURL3-'+str(i+1)+'xx', article.url) + template=template.replace('xxTitle3-'+str(i+1)+'xx', article.title) + template=template.replace('xxImg3-'+str(i+1)+'xx', article.img) + + + sourcesStr='' + for i in range(len(newsSourceArr)-1): + sourcesStr+=newsSourceArr[i].name+', ' + sourcesStr+=newsSourceArr[-1].name + print('Successfully parsed: '+sourcesStr) + template=template.replace('xxSourcesxx', sourcesStr) + + + #return updated text + return template + +def printOutputHTML(outputHTML, outFile): + timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + outputHTML=outputHTML.replace('xxTimexx', timestamp) + + f=open(outFile, 'w') + f.write(outputHTML) + f.close() + +def buildNewsSourceArr(sourceList): + + #build the data structure + i=0 + listLen=len(sourceList) + while i < listLen: + source=sourceList[i] + + if type(source) is NewsSource2: + i+=1 + continue + + url=source.url + + #download file + os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8") + content=f.read() + f.close() + + #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS + #os.remove('scratch/temp'+str(i)+'.html') + + #add stories etc to the NewsSource object + h1s, h2s, h3s=extractURLs(content, source) + + #build the Article objects and add to newsSource's appropriate list + if h1s!=None and h2s!=None: + for url in h1s: + article=buildArticle(url, source.name) + if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) + for url in h2s: + article=buildArticle(url, source.name) + if article!=None: sourceList[i].h2Arr.append(article) + for url in h3s: + article=buildArticle(url, source.name) + if article!=None: sourceList[i].h3Arr.append(article) + i+=1 + else: + sourceList.remove(source) + listLen-=1 + + + #return the original sourceList, + #since everything should have been modified in place + return sourceList diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py new file mode 100644 index 0000000..3affbe6 --- /dev/null +++ b/unbiased/unbiasedObjects.py @@ -0,0 +1,90 @@ +class Article(): + title='' + url='' + img='' + description='' + source='' + author='' + + def __init__(self, title, url, img, description, source, author): + self.title=title + self.url=url + self.img=img + self.description=description + self.source=source + self.author=author + + def __str__(self): + return '-----------\n'+self.title+'\n'+self.author+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------' + + +class NewsSource2(): + name='' + url='' + h1Arr=[] + h2Arr=[] + h3Arr=[] + def __init__(self, name, url, h1Arr, h2Arr, h3Arr): + self.name=name + self.url=url + self.h1Arr=h1Arr + self.h2Arr=h2Arr + self.h3Arr=h3Arr + + + +class NewsSource(): + name='' + url='' + #multiple start values to step through file. 
end value default to '"' + h1SectionDividerStart=None + h1SectionDividerEnd=None + h1DelStart=[] + h1DelEnd='"' + h2SectionDividerStart=None + h2SectionDividerEnd=None + h2DelStart=[] + h2DelEnd='"' + h3SectionDividerStart=None + h3SectionDividerEnd=None + h3DelStart=[] + h3DelEnd='"' + #arrays of Article object types + h1Arr=None + h2Arr=None + h3Arr=None + #url to attach to stub links + stubURL='' + + def __init__(self, name, url, + h1DelStart, h2DelStart, h3DelStart, + h1SectionDividerStart=None, h1SectionDividerEnd=None, + h2SectionDividerStart=None, h2SectionDividerEnd=None, + h3SectionDividerStart=None, h3SectionDividerEnd=None, + stubURL=None): + self.name=name + self.url=url + self.h1DelStart=h1DelStart + self.h2DelStart=h2DelStart + self.h3DelStart=h3DelStart + self.h1SectionDividerStart=h1SectionDividerStart + self.h2SectionDividerStart=h2SectionDividerStart + self.h3SectionDividerStart=h3SectionDividerStart + self.h1SectionDividerEnd=h1SectionDividerEnd + self.h2SectionDividerEnd=h2SectionDividerEnd + self.h3SectionDividerEnd=h3SectionDividerEnd + self.h1Arr=[] + self.h2Arr=[] + self.h3Arr=[] + self.stubURL=stubURL + + def addArticle(self, article, level): + if level==1: + self.h1Arr.append(article) + elif level==2: + self.h2Arr.append(article) + elif level==3: + self.h3Arr.append(article) + else: + print("Error: invalid level in NewsSource.addArtlce: ", level) + diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py deleted file mode 100644 index 1a80d7a..0000000 --- a/unbiasedFunctions.py +++ /dev/null @@ -1,259 +0,0 @@ -from unbiasedObjects import * -import os -import random -import time -import re - - -#take in a url and delimiters, return twitter card -def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): - - debugging=False - if debugging: - print(sourceName) - print(url) - print() - - #download url - os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) - - #read the file in - f=open('scratch/temp_article.html', 'r', encoding="utf8") - content=f.read() - f.close() - - try: - if sourceName=='The Guardian': - #The Guardian puts an identifying banner on their og:images - #grab the main image from the page instead - - #scenario 1: regular image - if '')[0] - elif sourceName=='ABC News': - img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' - if img[-1]=='/': - #because the quote separator could be ' or ", - #trim to just before it then lop it off - img=img[:-1].strip() - img=img[:-1] - - if debugging: - print(img) - - title=content.split('og:title" content=')[1][1:].split('>')[0] - if title[-1]=='/': - title=title[:-1].strip() - title=title[:-1] - - if debugging: - print(title) - - - author='' - if sourceName=='The Blaze': - if 'class="article-author">' in content: - author=content.split('class="article-author">')[1].split('<')[0] - elif 'class="article-author" href="' in content: - author=content.split('class="article-author" href="')[1] - author=author.split('>')[1].split('<')[0].strip() - else: - authorTags=['article:author', 'dc.creator', 'property="author'] - for tag in authorTags: - if tag in content: - author=content.split(tag+'" content=')[1][1:].split('>')[0] - author=author[:-1] - #trim an extra quotation mark for The Hill - if sourceName=='The Hill': - author=author.split('"', 1)[0] - break - - if debugging: - print(author) - - - if 'og:description' in content: - description=content.split('og:description" content=')[1][1:].split('>')[0] - if description[-1]=='/': - 
description=description[:-1].strip() - description=description[:-1] - else: - if sourceName=='The Hill': - description=content.split('div class="field-items"')[-1] - description=re.sub('<[^<]+?>', '', description) - description=description[1:200] - else: - print("SHOULDN'T GET HERE") - - #strip out self-references - description=description.replace(sourceName+"'s", '***') - description=description.replace(sourceName+"'", '***') - description=description.replace(sourceName, '***') - - if debugging: - print(description) - - - a=Article(title, url, img, description, sourceName, author) - return a - - except: - print('^^^^^^^^^^^^^^^^^^^^^^^^^') - print('\tARTICLE PARSING ERROR') - print('SOURCE: '+sourceName) - print('URL: \t'+url) - print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n') - return None - - -def buildOutput(newsSourceArr): - #read in the template html file - f=open('html_template/template.html', 'r') - template=f.read() - f.close() - - #set the random order for sources - h1RandomSources=[] - while len(h1RandomSources)<4: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h1Arr)>0: - if x not in h1RandomSources: - h1RandomSources.append(x) - else: - print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - - #For h2s and h3s, select N random sources (can repeat), then - #a non-repetitive random article from within - h2RandomPairs=[] - while len(h2RandomPairs) < 6: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h2Arr) > 0: - y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0] - pair=[x,y] - if not pair in h2RandomPairs: - h2RandomPairs.append(pair) - else: - print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - - h3RandomPairs=[] - while len(h3RandomPairs) < 12: - x=random.sample(range(len(newsSourceArr)), 1)[0] - print(newsSourceArr[x].name) - if len(newsSourceArr[x].h3Arr) > 0: - y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0] - pair=[x,y] - if not pair in h3RandomPairs: - h3RandomPairs.append(pair) - else: - print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - - #replace html template locations with data from newsSourceArr - for i in range(len(h1RandomSources)): - source=newsSourceArr[h1RandomSources[i]] - randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] - article=source.h1Arr[randomArticle] - template=template.replace('xxURL1-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg1-'+str(i+1)+'xx', article.img) - desc=article.description - if len(desc)>144: - desc=desc[:141] - desc=desc.split()[:-1] - desc=' '.join(desc)+' ...' 
- template=template.replace('xxDesc1-'+str(i+1)+'xx', desc) - - for i in range(len(h2RandomPairs)): - pair=h2RandomPairs[i] - article=newsSourceArr[pair[0]].h2Arr[pair[1]] - template=template.replace('xxURL2-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg2-'+str(i+1)+'xx', article.img) - - for i in range(len(h3RandomPairs)): - pair=h3RandomPairs[i] - article=newsSourceArr[pair[0]].h3Arr[pair[1]] - template=template.replace('xxURL3-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle3-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg3-'+str(i+1)+'xx', article.img) - - - sourcesStr='' - for i in range(len(newsSourceArr)-1): - sourcesStr+=newsSourceArr[i].name+', ' - sourcesStr+=newsSourceArr[-1].name - print('Successfully parsed: '+sourcesStr) - template=template.replace('xxSourcesxx', sourcesStr) - - - #return updated text - return template - -def printOutputHTML(outputHTML, outFile): - timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) - outputHTML=outputHTML.replace('xxTimexx', timestamp) - - f=open(outFile, 'w') - f.write(outputHTML) - f.close() - -def buildNewsSourceArr(sourceList): - - #build the data structure - i=0 - listLen=len(sourceList) - while i < listLen: - source=sourceList[i] - - if type(source) is NewsSource2: - i+=1 - continue - - url=source.url - - #download file - os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) - - #read file - f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8") - content=f.read() - f.close() - - #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS - #os.remove('scratch/temp'+str(i)+'.html') - - #add stories etc to the NewsSource object - h1s, h2s, h3s=extractURLs(content, source) - - #build the Article objects and add to newsSource's appropriate list - if h1s!=None and h2s!=None: - for url in h1s: - article=buildArticle(url, source.name) - if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) - for url in h2s: - article=buildArticle(url, source.name) - if article!=None: sourceList[i].h2Arr.append(article) - for url in h3s: - article=buildArticle(url, source.name) - if article!=None: sourceList[i].h3Arr.append(article) - i+=1 - else: - sourceList.remove(source) - listLen-=1 - - - #return the original sourceList, - #since everything should have been modified in place - return sourceList diff --git a/unbiasedObjects.py b/unbiasedObjects.py deleted file mode 100644 index 3affbe6..0000000 --- a/unbiasedObjects.py +++ /dev/null @@ -1,90 +0,0 @@ -class Article(): - title='' - url='' - img='' - description='' - source='' - author='' - - def __init__(self, title, url, img, description, source, author): - self.title=title - self.url=url - self.img=img - self.description=description - self.source=source - self.author=author - - def __str__(self): - return '-----------\n'+self.title+'\n'+self.author+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------' - - -class NewsSource2(): - name='' - url='' - h1Arr=[] - h2Arr=[] - h3Arr=[] - def __init__(self, name, url, h1Arr, h2Arr, h3Arr): - self.name=name - self.url=url - self.h1Arr=h1Arr - self.h2Arr=h2Arr - self.h3Arr=h3Arr - - - -class NewsSource(): - name='' - url='' - #multiple start values to step through file. 
end value default to '"'
-    h1SectionDividerStart=None
-    h1SectionDividerEnd=None
-    h1DelStart=[]
-    h1DelEnd='"'
-    h2SectionDividerStart=None
-    h2SectionDividerEnd=None
-    h2DelStart=[]
-    h2DelEnd='"'
-    h3SectionDividerStart=None
-    h3SectionDividerEnd=None
-    h3DelStart=[]
-    h3DelEnd='"'
-    #arrays of Article object types
-    h1Arr=None
-    h2Arr=None
-    h3Arr=None
-    #url to attach to stub links
-    stubURL=''
-
-    def __init__(self, name, url,
-                 h1DelStart, h2DelStart, h3DelStart,
-                 h1SectionDividerStart=None, h1SectionDividerEnd=None,
-                 h2SectionDividerStart=None, h2SectionDividerEnd=None,
-                 h3SectionDividerStart=None, h3SectionDividerEnd=None,
-                 stubURL=None):
-        self.name=name
-        self.url=url
-        self.h1DelStart=h1DelStart
-        self.h2DelStart=h2DelStart
-        self.h3DelStart=h3DelStart
-        self.h1SectionDividerStart=h1SectionDividerStart
-        self.h2SectionDividerStart=h2SectionDividerStart
-        self.h3SectionDividerStart=h3SectionDividerStart
-        self.h1SectionDividerEnd=h1SectionDividerEnd
-        self.h2SectionDividerEnd=h2SectionDividerEnd
-        self.h3SectionDividerEnd=h3SectionDividerEnd
-        self.h1Arr=[]
-        self.h2Arr=[]
-        self.h3Arr=[]
-        self.stubURL=stubURL
-
-    def addArticle(self, article, level):
-        if level==1:
-            self.h1Arr.append(article)
-        elif level==2:
-            self.h2Arr.append(article)
-        elif level==3:
-            self.h3Arr.append(article)
-        else:
-            print("Error: invalid level in NewsSource.addArticle: ", level)
-
-- cgit v1.2.3
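Packaging note: the commit above only moves files into the new unbiased/ package;
the moved modules still import one another as top-level modules (for example,
unbiased/main.py keeps "from unbiasedObjects import *" and "from parser import *",
and the bare name parser can collide with the standard-library parser module on
the Python 3 versions current at the time). A minimal sketch of the
package-qualified imports and the setup.py this layout points toward; the project
name, version, and package_data mapping are illustrative assumptions, not part of
this commit:

    # unbiased/main.py, sketched package-qualified imports (assumed, not in this commit)
    from unbiased.unbiasedObjects import *
    from unbiased.unbiasedFunctions import *
    from unbiased.parser import *

    # setup.py, minimal setuptools sketch; metadata values are placeholders
    from setuptools import setup, find_packages

    setup(
        name='unbiased',                                 # assumed from the new package directory
        version='0.1',                                   # illustrative placeholder
        packages=find_packages(),                        # discovers the unbiased/ package
        package_data={'unbiased': ['html_template/*']},  # ship the HTML/CSS templates with the package
    )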