From 3126b094485af5a374d0ba400604a8b2805337af Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Thu, 9 Feb 2017 10:18:18 -0500 Subject: VIM is fucking garbage --- .gitignore | 4 + html_template/Penguins.jpg | Bin 0 -> 777835 bytes html_template/template.html | 113 + html_template/unbiased.css | 88 + legacy_py/downloadFiles.py | 24 + legacy_py/get_sources.py | 149 + legacy_py/update_stories.py | 10 + main.py | 65 + scratch/temp0.html | 4955 +++++++++++++++++++++++++++++++ scratch/temp1.html | 1321 +++++++++ scratch/temp2.html | 6766 +++++++++++++++++++++++++++++++++++++++++++ scratch/temp3.html | 4040 ++++++++++++++++++++++++++ scratch/temp_article.html | 950 ++++++ unbiased.html | 110 + unbiasedFunctions.py | 248 ++ unbiasedObjects.py | 67 + 16 files changed, 18910 insertions(+) create mode 100644 .gitignore create mode 100755 html_template/Penguins.jpg create mode 100755 html_template/template.html create mode 100755 html_template/unbiased.css create mode 100644 legacy_py/downloadFiles.py create mode 100755 legacy_py/get_sources.py create mode 100755 legacy_py/update_stories.py create mode 100755 main.py create mode 100644 scratch/temp0.html create mode 100644 scratch/temp1.html create mode 100644 scratch/temp2.html create mode 100644 scratch/temp3.html create mode 100644 scratch/temp_article.html create mode 100644 unbiased.html create mode 100644 unbiasedFunctions.py create mode 100644 unbiasedObjects.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d5eed0c --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.pyc +*~ +__pycache__/ +scratch/* \ No newline at end of file diff --git a/html_template/Penguins.jpg b/html_template/Penguins.jpg new file mode 100755 index 0000000..030ab8a Binary files /dev/null and b/html_template/Penguins.jpg differ diff --git a/html_template/template.html b/html_template/template.html new file mode 100755 index 0000000..7869155 --- /dev/null +++ b/html_template/template.html @@ -0,0 +1,113 @@ + + + + + + + UnBiased 
+ + + + + +
+ + + + +
+
+ +
+ + diff --git a/html_template/unbiased.css b/html_template/unbiased.css new file mode 100755 index 0000000..83bae40 --- /dev/null +++ b/html_template/unbiased.css @@ -0,0 +1,88 @@ +#page-header{ + text-align:center; + padding:.5em 0 1em; + margin-bottom:1em; + border-bottom:1px solid #000; +} + +.title{ + font-size:3em; +} + +#title-1{ + font-style:italic; + color:#d00; +} + +#title-2{ + color:#00d; +} + +#subtitle{ + font-size:1.25em; +} + +#timestamp{ + margin:.5em 0 0 0; + font-size:.8em; +} + +#page-container{ + width:1150px; + padding:0 1em; + margin-left:auto; + margin-right:auto; +} + +#top-stories{ + width:1150px; + margin-left:auto; + margin-right:auto; + font-size:1.25em; +} + +.top-story{ + width:350px; + float:left; + margin:0 .5em; +} + +.top-stories-img{ + width:350px; + height:200px; + overflow:hidden; +} + +.top-stories-img img{ + width:100%; + display:block; + vertical-align:text-bottom; +} + +#middle-stories{ + clear:both; + width:1000px; + margin:0 auto; +} + +.middle-story{ + margin:2em 5px; + width:45%; + float:left; + height:100px; +} + +.middle-story img{ + vertical-align:middle; + height:100px; + float:left; + margin-right:1em; +} + +.middle-stories-hed{ + font-size:1.1em; +} + +.middle-story p{ + display:block; +} \ No newline at end of file diff --git a/legacy_py/downloadFiles.py b/legacy_py/downloadFiles.py new file mode 100644 index 0000000..9a79e27 --- /dev/null +++ b/legacy_py/downloadFiles.py @@ -0,0 +1,24 @@ +import os + +files=[] +files.append('http://www.nytimes.com') +files.append('http://www.nbcnews.com') +files.append('http://www.foxnews.com') + +stories=[] +stories.append('https://www.nytimes.com/2017/02/03/business/dealbook/trump-congress-financial-regulations.html') +stories.append('http://www.nbcnews.com/news/us-news/over-100-000-visas-have-been-revoked-immigration-ban-justice-n716121') 
+stories.append('http://www.foxnews.com/politics/2017/02/03/calls-mount-for-trump-administration-to-label-muslim-brotherhood-terrorist-organization.html') + +for i in range(len(files)): + os.system('wget -O source'+str(i+1)+'.html --no-check-certificate '+files[i]) + +for i in range(len(stories)): + os.system('wget -O story'+str(i+1)+'.html --no-check-certificate '+stories[i]) + +''' +f=open('testOut.html', 'r', encoding="utf8") +content=f.read() +f.close() +#os.remove('testOut.html') +''' diff --git a/legacy_py/get_sources.py b/legacy_py/get_sources.py new file mode 100755 index 0000000..cb23f61 --- /dev/null +++ b/legacy_py/get_sources.py @@ -0,0 +1,149 @@ +from bs4 import BeautifulSoup +import wget +import os + +def getHeds(heds, h_class=None): + hed_text=[] + hed_hrefs=[] + #print(heds[0]) + for hed in heds: + #print('*'+hed.string) + if hed.a: + try: + if h_class==None or hed['class'][0]==h_class: + if hed.a.string!=None: hed_text.append(hed.a.string.strip()) + if hed.a['href']!='': hed_hrefs.append(hed.a['href']) + except: + continue + return hed_text, hed_hrefs + + + +def get_nyt(): + #init vars + stories={} + + #download and soup web page + wget.download('https://nytimes.com', out="nyt.html") + soup = BeautifulSoup(open("nyt.html", encoding="utf8"), "lxml") + os.remove('nyt.html') + + #get top story + h1s=soup('h1') + h1_heds, h1_href = getHeds(h1s, 'story-heading') + stories['h1']={'heds':[h1_heds[0]], 'href':[h1_href[0]]} + + #get secondary stories + aCol = soup.find_all('div', 'a-column') + h2s=aCol[0].find_all('h2') + h2_heds, h2_href = getHeds(h2s, 'story-heading') + stories['h2']={'heds':h2_heds, 'href':h2_href} + + #get tertiary stories + bCol = soup.find_all('div', 'b-column') + h3s=bCol[0].find_all('h2') + h3_heds, h3_href = getHeds(h3s, 'story-heading') + stories['h3']={'heds':h3_heds, 'href':h3_href} + + return stories + + +def get_fox(): + #init vars + stories={} + h2_heds = [] + h2_href = [] + + #download and soup web page + 
wget.download('http://www.foxnews.com', out="fox.html") + soup = BeautifulSoup(open("fox.html", encoding="utf8"), "lxml") + os.remove('fox.html') + + #get top story + h1s=soup('h1') + h1_heds, h1_href = getHeds(h1s) + stories['h1']={'heds':[h1_heds[0]], 'href':[h1_href[0]]} + + #get secondary stories -
  • (loop for first ) + topStories = soup('div', 'top-stories') + topStoriesCols = topStories[0].ul + for c in topStoriesCols.children: + if c.string==None: + h2_heds.append(c.h3.text) + h2_href.append(c.a['href']) + + stories['h2']={'heds':h2_heds, 'href':h2_href} + + #get tertiary stories + h3s=topStoriesCols.find_all('li') + h3_heds, h3_href=getHeds(h3s) + for href in h3_href: + if href in h2_href: + h3_href.remove(href) + + stories['h3']={'heds':h3_heds, 'href':h3_href} + + return stories + + +def get_nbc(): + #init vars + stories={} + + #download and soup web page + wget.download('http://www.nbcnews.com', out="nbc.html") + soup = BeautifulSoup(open("nbc.html", encoding="utf8"), "lxml") + os.remove('nbc.html') + + #get top story + panel = soup.find_all('div', 'panel-txt_hero') + h1_heds=panel[0].find_all('h3') + panel = soup.find_all('div', 'panel-txt') + stories['h1']={'heds':[h1_heds[0].text.strip()], 'href':['http://www.nbcnews.com'+panel[0].a['href']]} + + #get secondary stories - div class panel + h2s = soup.find_all('div', 'story-link_default-height') + story_heds=[] + for item in h2s: + story_heds.append(item.h3.text.strip()) + h2_heds, h2_href = getHeds(h2s) + for i in range(len(h2_href)): + h2_href[i]='http://www.nbcnews.com'+h2_href[i] + stories['h2']={'heds':story_heds[:3], 'href':h2_href[:3]} + + #get tertiary stories - div class story-link + stories['h3']={'heds':story_heds[3:], 'href':h2_href[3:]} + + return stories + +def getTwitterCard(url): + card={} + + #wget.download(url, out="card.html") + cmd='wget --no-check-certificate '+url+' -O card.html' + print(cmd) + ret=os.system(cmd) + print(ret) + soup = BeautifulSoup(open("card.html", encoding="utf8"), "lxml") + #os.remove('card.html') + + hed=soup.find_all('meta', {'name' : 'twitter:title'}) + card['hed']=hed[0]['content'] + + img=soup.find_all('meta', {'name' : 'twitter:image'}) + card['img']=img[0]['content'] + + return card + + + +''' +links=get_nbc() 
+card=getTwitterCard(links['h2']['href'][0]) +print(card) + + +nyt=get_nyt() +fox=get_fox() +nbc=get_nbc() +''' diff --git a/legacy_py/update_stories.py b/legacy_py/update_stories.py new file mode 100755 index 0000000..4340148 --- /dev/null +++ b/legacy_py/update_stories.py @@ -0,0 +1,10 @@ +from get_sources import * + +nyt=get_nyt() +#fox=get_fox() +#nbc=get_nbc() + +url=nyt['h2']['href'][0] +print(url) +card=getTwitterCard("https://www.nytimes.com/2017/01/30/us/politics/trump-immigration-ban-memo.html") +#print(card) diff --git a/main.py b/main.py new file mode 100755 index 0000000..19fe8b0 --- /dev/null +++ b/main.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import * +import time + +def main(): + while True: + print('-----------------------') + run() + print('-----------------------') + time.sleep(120) + +def run(): + sourceList=[] + sourceList.append(NewsSource('New York Times', + 'http://nytimes.com', + ['', '', + 'section id="top-news" class="top-news"', '
  • ', + 'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo')) + + sourceList.append(NewsSource('Fox News', + 'http://foxnews.com', + ['

    ', '
    ', + None, None)) + + #scrape all urls and build data structure + newsSourceArr=buildNewsSourceArr(sourceList) + + #build the output file HTML + outputHTML=buildOutput(newsSourceArr) + #print the output file HTML + printOutputHTML(outputHTML, '/var/www/html/index.html')#'unbiased.html') + + +if __name__=="__main__": + main() diff --git a/scratch/temp0.html b/scratch/temp0.html new file mode 100644 index 0000000..6441dae --- /dev/null +++ b/scratch/temp0.html @@ -0,0 +1,4955 @@ + + + + + + + The New York Times - Breaking News, World News & Multimedia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + +
    +
    + + + + +
    +
    + + + + + + + + + +
    +
    + +
    + +
    + +
    +

    Top News

    + + + + +
    + +
    + +
    + + +
    +
    + +

    Gorsuch Calls Trump Critique of Federal Court ‘Demoralizing’

    + + + +

    • President Trump called a federal court hearing of his immigration order “disgraceful” and said the judges failed to grasp concepts that “a bad high school student would understand.”
    • Mr. Trump’s nominee for the Supreme Court, Neil M. Gorsuch, said the president’s attack was “demoralizing” and “disheartening.”

    + +

    +  Comments +

    + +
    + + +
    + +
    +
    + +

    Sessions Confirmed as Partisan Rancor Racks the Senate

    + + + +

    The 52-to-47 vote ended a racially charged battle over Senator Jeff Sessions’s nomination as attorney general. The debate was dominated by the Senate’s formal rebuke of Senator Elizabeth Warren.

    + +

    +  Comments +

    + +
    +
    +
    + +
    + + + + + + + + + +
    + + +
    + +
    +

    Trump

    +

    Obama

    +

    W. Bush

    +

    Clinton

    +

    Bush

    +
    +
    +
    +
    + + + +
    +
    + + + +
    + +
    + + +
    +
    + +

    Pentagon Unit Proposed Secret Prison in 2002

    + + + +

    The proposal was presented in a 2002 memo written in part by one of the psychologists who eventually helped create the C.I.A.’s “enhanced interrogation” program.

    + + +
    +
    +
    +
    + + + + +
    +
    +
    +

    Got a confidential news tip?

    +

    The New York Times offers several ways to get in touch with and provide materials to our journalists. Learn more.

    +
    +
    +
    + + +
    + +
    + +
    + +
    + +
    + +
    + +
    +
    + +
    + +
    + +
    + + +
    + +
    + + + +
    +
    +
    + +
    +
    + +
    +
    + + +

    Your Thursday Briefing

    + + + +

    + Here’s what you need to know to start your day.

    + + + +
    +
    +
    + +
    +
    + + +
    + + + +
    +
    + + + + +
    +
    + +
    +

    Audio

    +

    + + Listen to ‘The Daily’ + +

    +

    Is Senator Warren actually a danger to the Democratic Party? And what does Donald Trump’s election mean for the markets? We discuss the issues.

    +
    + Audio +
    +
    +
    +
    +
    +
    +
    +
    + +

    Personal Health

    +

    What Dizziness Can Actually Mean

    + +
    + +
    + + + +

    + People use the word dizziness when referring to lightheadedness, unsteadiness, motion intolerance, imbalance, floating or a tilting sensation.

    + +

    +  Comments +

    + +
    + +
    +
    + + + + +
    +
    + + +
    +
    +
    + +

    The Secrets of Jacqueline Kennedy’s Heart

    + +
    + +
    + + + +

    + Among recently discovered papers were letters from Mrs. Kennedy to David Ormsby Gore, including an explanation of her decision to marry Aristotle Onassis instead of him.

    + + +
    +
    +
    + + + +
    +
    + + +

    Can Raf Simons Reinvent American Fashion?

    + + + +
    + +
    + +

    + One of the most respected designers of his generation is in New York to reboot Calvin Klein, and the fashion world is cheering.

    + + +
    + +
    +
    + +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    +
    + + +

    California Today

    + + +

    + The news and stories that matter to Californians (and anyone else interested in the state). Sign up to get it by email.

    + + + +
    +
    + +
    + +
    + +
    + +
    + + +
    + +
    + + +
    +
    +
    +

    +

    +
    + +
    +
    + +
    +
    + +
    +
    +
    + +
    + +
    + +
    + + + +
    + + +
    + +
    + +
    + +
    + +
    + +
    +
    +
    +
    + + +
    +
    + +
    +
    + +

    The Grandeur of the Courts

    + + + +

    + Oral arguments on the president’s travel ban reminded America what the rule of law looks like.

    + +

    +  Comments +

    + + +
    +
    + +
    +
    + +
    +
    +
    + +
    + + +
    +
    + +
    + +
    +
    +
    +
    + +
    + +
    + +

    User Subscriptions

    + + + + + +
    + +
    + +
    + + + +
    + +
    + + +
    + +
    +
      +
      +

      Stories you save are added to your Reading List.

      +

      Here are some suggestions to get started.

      +
      + +
      + +
      + + + + + + +
      +
      +
      +

      Watching

      +
      +
      +
      +
      +
      + + + + +
      + +
      + +
      + + +
      + +
      + + +
      + +
      + +
      + + + + +
      +
      +
      +
      Loading...
      +
      + +
      +
      + +
      + +
      + + + +
      + +
      + +
      + +
      +
      +

      Sections

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      + +
      + +
      + + +
      +
      +
      +
      + +
      +

      + Real Estate » +

      + + +
      + + +
      +
      +
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      + + +
      + + +
      +
      +
      +
      +
      +
      + + + +
      +
      Loading...
      +
      +
      + + + + +
      + +
      + + + + +
      +
      +
      +
      +

      Go to Home Page »

      +

      + Site Index + + The New York Times + +

      + +
      + + + +
      + + +
      +
      + + + + + + + + + + + + + + + diff --git a/scratch/temp1.html b/scratch/temp1.html new file mode 100644 index 0000000..ceb1dd2 --- /dev/null +++ b/scratch/temp1.html @@ -0,0 +1,1321 @@ + + + + + + + + + + Fox News - Breaking News Updates | Latest News Headlines | Photos & News Videos + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + +
      + + +
      + +
      + + + + + + +
      +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +

      Latest News

      + + + + + +
      + + + +
      + + +
      + +
      +
      +
      + 64° +
      +
      +
      +
      +
      + + +
      +
      + Close + New York, NY +
      +
      +
      +

      Detailed Forecast

      +
      +
      + + + + +
      +

      Watch Now

      +
        +
        + +
        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +

        Features & Faces

        + +
        + + +
        +
        +

        Markets

        + + +
        + +
        + + +
        +
        +

        Watch Fox News

        + Fox News Go +
        +
        +
        +
        +
        + Now Playing +

         

        +
        +

        Full Schedule

        +
        +
        + + + +
        + +
        +

        Opinion

        +

        Continue to Opinion

        +
        +
        +

        Sean Hannity: Phony Native American Liz Warren leads Dems twisted anti-Trump charge

        + +
        +
        +

        America's kids got more stupid in reading, math and science while Team Obama was in charge

        + +
        +
        +

        Andrew Napolitano: What every American wants to know about federal judges

        + +
        +
        +

        We have a national security crisis: Let's do nothing

        + +
        +
        + +
        + +
        +

        Health

        +

        Continue to Health

        +
        +
        + +
        +

        Woman who lost memory had to fall for boyfriend twice

        +
        +
        +
        + +
        +

        Pennsylvania school disctrict sued over lead levels in water

        +
        +
        +
        + +
        +

        Self-administered expanders allow breast cancer patients to heal at home

        +
        +
        +
        + +
        +

        I was thin, fit— and had a heart attack at 28

        +
        +
        +
        + + + +
        +
        +
        + + + + + + +
        + +
        +

        The Property Project

        +

        Continue to The Property Project

        +
        +
        + +
        +

        These 8 home gyms will make you want to work out

        +
        +
        +
        + +
        +

        10 Messy Habits to Break for a Cleaner Home

        +
        +
        +
        + +
        +

        How to Declutter Your Garden Using the KonMari Method

        +
        +
        +
        + +
        +

        How to Grow the Edible Garden That's Right for You

        +
        +
        +
        + +
        + +
        +

        Food & Drink

        +

        Continue to Food & Drink

        +
        + +
        + +
        +

        Study finds new bacterial strain can contaminate shellfish

        +
        +
        + +
        + +
        +

        Lena Dunham has a tattoo of her favorite restaurant in a very private place

        +
        +
        +
        + + + + + + + + +
        + +

        What to Watch

        + + + + +
        + +
        + +
        +

        Technology

        +

        Continue to Technology

        +
        +
        + +
        +

        President Trump gives Twitter a boost, analyst says

        +
        +
        +
        + +
        +

        Wikipedia bans editors from citing Daily Mail as source

        +
        +
        + +
        + +
        +

        New research suggests anyone can become an Internet troll

        +
        +
        +
        + +
        +
        +

        Regional News

        +

         

        +
        +
        +
        + + +
        +
        +
        + Close +
        +
          +
        +
        + +
        + +
        + + +
        + +
        +

        Entertainment

        +

        Continue to Entertainment

        +
        +
        + +
        +

        Kelly Ripa's bedroom confession: Husband Mark Consuelos is 'mean' after doing the deed

        +
        +
        +
        + +
        +

        Christie Brinkley's ex romancing 21-year-old college student?

        +
        +
        +
        + +
        +

        Oprah Winfrey sells painting for $150M

        +
        +
        +
        + +
        +

        YouTube singer's estate sues Beyonce for $20M in copyright infringement suit

        +
        +
        +
        + +
        + +
        +

        Business Leaders

        +

        Continue to Business Leaders

        +
        +
        + +
        +

        Why the Stock Market Under Trump Will Probably Lag the Market Under Obama

        +
        +
        +
        + +
        +

        Goldman Sachs Adds Investment-Management Head to Partnership Committee

        +
        +
        +
        + +
        +

        Dr. Alveda King: Sen. Warren Used the King Name to Stir Up Emotions

        +
        +
        +
        + +
        +

        Harley-Davidson CEO on Trump Meeting, Corporate Tax Reform

        +
        +
        +
        + +
        + +
        +
        +
        +

        Top Slideshows & Lists

        +
        +
        + + + + + +
        + +
        +

        Science

        +

        Continue to Science

        +
        + +
        +
        +

        Digging History

        +

        View all

        +
        + +
        +

        She was one of George Washington's slaves, until she managed to escape

        +
        +
        +
        +
        +

        Air & Space

        +

        View all

        +
        + +
        +

        First atomic blast reveals clues about moon formation

        +
        +
        + +
        + +
        + +
        +

        US

        +

        Continue to US

        +
        +
        + +
        +

        Protests erupt outside Phoenix ICE office after arrest of illegal immigrant

        +
        +
        +
        + +
        +

        US probes plane that got too close to Air Force One, report says

        +
        +
        +
        + +
        +

        Major snowstorm bears down on Northeast; 3,000 flights canceled

        +
        +
        +
        + +
        +

        Oakland's incoming police chief to receive historic salary

        +
        +
        +
        + + + + +
        + + + + + +
        + + + + + + + diff --git a/scratch/temp2.html b/scratch/temp2.html new file mode 100644 index 0000000..d35dcd6 --- /dev/null +++ b/scratch/temp2.html @@ -0,0 +1,6766 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + NBC News - Breaking News & Top Stories - Latest World, US & Local News + + + + + + + + + + + + + + + + + + + + + +
        + + +
        + +
        +
        +
        +
        +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        Top Stories

        +
        +
        + + +
        +
        +
        + +
        +
        +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        Local News

        +
        +
        +
        +
        + + Find Affiliate + + Edit Location + + + +
        +
        +
        +
        +
        +
        +
        + + + +
        +
        +
        +
        +
        +
        + +
        +
        Fair
        +

        +

        + +
        +
        +
        +
        +
        + Forecast from: The Weather Channel +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        + +
        +
        + +
        +
        +
        +
        + +
        +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        + U.S. News +

        +
        +
        + +
        +
        +
        +
        +
        + + +
        +
        +
        +
        + +
        +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        + +
        +
        +
        +
        + + +
        +
        +
        +
        + +
        +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        + Pop Culture +

        +
        + +
        +
        +
        +
        +
        + + +
        +
        +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        +

        + Lifestyle +

        +
        + +
        +
        +
        +
        +
        + + +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        + Investigations +

        +
        + +
        +
        +
        +
        +
        + + +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        + NBC OUT +

        +
        + +
        +
        +
        +
        + +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        + Latino +

        +
        + +
        +
        +
        +
        +
        + + +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +

        + Asian America +

        +
        + +
        +
        +
        +
        +
        + + +
        +
        +
        +
        + +
        +
        + +
        +
        + +
        +
        +
        + +
        +
        +
        +
        + +
        +
        +
        +
        + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scratch/temp3.html b/scratch/temp3.html new file mode 100644 index 0000000..c983423 --- /dev/null +++ b/scratch/temp3.html @@ -0,0 +1,4040 @@ + +CBS News - Breaking News, Live News stream 24x7 + + + + + + + + + + +
        + + +
        + + +
        + + + + +
        +
        +
        + + +
        +
        +
        +
        + + +
        +
        + + +
        + +
        + + +
        +
        + + + + + + + + +
        +
        +
        +
        + + + + +
        +
        +
        +
        + + +
        +
        +
        +
        + + +
        +
        + + + + + + + + + + + + + + + + +
        +
        +
        +
        + + +
        +
        + + + + + + + + + + + + + + + + +
        +
        +
        + +
        + + + + +
        +
        +
        + +
        + + + + +
        + + + + + + diff --git a/scratch/temp_article.html b/scratch/temp_article.html new file mode 100644 index 0000000..63bb16c --- /dev/null +++ b/scratch/temp_article.html @@ -0,0 +1,950 @@ + +Eye Opener: Major snowstorm blankets Northeast - Videos - CBS News + + + + + +
        + + + + + + +
        +
        Loading...
        +
        + +
        +
        + + + +
        +
        +
        +
        +
        +
        + + + + +

        Eye Opener: Major snowstorm blankets Northeast

        +
        +
        +

        | + A massive winter storm is battering the Northeast right now, affecting 50 million Americans. Also, Supreme Court nominee Neil Gorsuch says President Trump's criticism of judges is "demoralizing." All that and all that matters in today's Eye Opener. Your world in 90 seconds. Get the Eye Opener delivered straight to your inbox. +

        +
        +
        + + + +
        + +
        + +
        +
        +
        +
        + + + +
        + + + + + + diff --git a/unbiased.html b/unbiased.html new file mode 100644 index 0000000..228deef --- /dev/null +++ b/unbiased.html @@ -0,0 +1,110 @@ + + + + + UnBiased + + + + + + + + diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py new file mode 100644 index 0000000..84b4ec9 --- /dev/null +++ b/unbiasedFunctions.py @@ -0,0 +1,248 @@ +from unbiasedObjects import * +import os +import random +import time + +#take in a url and delimiters, return twitter card +def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): + + print(sourceName) + print(url) + print() + #download url + os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) + + #read the file in + f=open('scratch/temp_article.html', 'r', encoding="utf8") + content=f.read() + f.close() + + #because the quote separator could be ' or ", trim to just before it then lop it off + img=content.split('og:image" content=')[1][1:].split('>')[0]#[:-1] + if img[-1]=='/': + img=img[:-1].strip() + img=img[:-1] + + title=content.split('og:title" content=')[1][1:].split('>')[0]#[1].split('"')[0] + if title[-1]=='/': + title=title[:-1].strip() + title=title[:-1] + + a=Article(title, url, img, sourceName) + return a + + +#do the hardcore HTML parsing +def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd): + retArr=[] + + if sectionDividerStart!=None: + content=content.split(sectionDividerStart)[1] + if sectionDividerEnd!=None: + content=content.split(sectionDividerEnd)[0] + if delStart!=[]: + while True: + x=content + for delim in delStart: + if delim in content: + x=content.split(delim)[1] + x=x.split(delEnd)[0] + if x not in retArr: + retArr.append(x) + content=content.split(delStart[0], 1) + if(len(content)==1): + break + else: + content=content[1:][0] + + return retArr + + + +''' +**********************8 + +Need to fix this function to use splitHTML() and actually loop through +all of the links instead of just using the first 
# NOTE: tail of the module-level notes block that opens before this chunk:
#   ... one.
#   ************************


def _absoluteURL(url, base):
    """Return *url* unchanged if it already looks absolute, else ``base + url``.

    A link is treated as absolute when it contains '.com' (the original
    heuristic) or starts with an explicit http(s) scheme. The scheme check
    keeps absolute links on non-.com domains from being glued onto *base*.
    """
    if '.com' in url or url.startswith(('http://', 'https://')):
        return url
    return base + url


def _cutSection(text, start, end):
    """Narrow *text* to the part after *start* and before *end*.

    Either divider may be None, meaning "no restriction on that side".
    Returns None when a start divider is given but does not occur in *text*;
    a missing end divider leaves the text untrimmed.
    """
    if start is not None:
        _, found, text = text.partition(start)
        if not found:
            return None
    if end is not None:
        text = text.partition(end)[0]
    return text


def _nextLink(text, delims, end):
    """Step through *delims* in order, then read up to *end*.

    Returns ``(link, rest)`` where *rest* is the text following the end
    delimiter, or None as soon as any delimiter (or *end*) is not found.
    """
    for delim in delims:
        _, found, text = text.partition(delim)
        if not found:
            return None
    link, found, rest = text.partition(end)
    if not found:
        return None
    return link, rest


#take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
def extractURLs(content, source):
    """Extract story URLs from the downloaded front page of a news source.

    Args:
        content: the page text.
        source: a NewsSource; its h1*/h2* divider and delimiter attributes
            describe where the links sit in the page, and its ``url`` is
            prefixed to links that are not absolute.

    Returns:
        A tuple ``(h1s, h2s, h3s)`` of URL lists. ``h1s`` holds at most one
        URL (the first match), ``h2s`` holds every match inside the h2
        section, and ``h3s`` is always empty because no h3 extraction is
        implemented here.

    A level whose dividers/delimiters cannot be found in *content* yields an
    empty list instead of raising IndexError.
    """
    h1s = []
    h2s = []
    h3s = []

    #h1: only the first link in the section is used
    h1 = _cutSection(content, source.h1SectionDividerStart, source.h1SectionDividerEnd)
    if h1 is not None:
        match = _nextLink(h1, source.h1DelStart, source.h1DelEnd)
        if match is not None:
            h1s.append(_absoluteURL(match[0], source.url))

    #h2: collect links until the delimiters stop matching.
    #An empty delimiter list would match everywhere, so it yields no links.
    h2 = _cutSection(content, source.h2SectionDividerStart, source.h2SectionDividerEnd)
    if h2 is not None and source.h2DelStart:
        while True:
            match = _nextLink(h2, source.h2DelStart, source.h2DelEnd)
            if match is None:
                break
            link, h2 = match  # continue scanning after the link just read
            h2s.append(_absoluteURL(link, source.url))

    return h1s, h2s, h3s


def _fillSlots(template, newsSourceArr, level, slots=4):
    """Fill up to *slots* placeholder groups of one headline level.

    Picks distinct random sources that have at least one article at *level*
    and, for each, one random article. Slot ``n`` (1-based) replaces
    ``xxURL<level>-<n>xx``, ``xxTitle<level>-<n>xx`` and
    ``xxImg<level>-<n>xx``. When fewer than *slots* sources qualify, the
    remaining placeholders are left untouched.
    """
    arrName = 'h' + str(level) + 'Arr'
    candidates = [src for src in newsSourceArr if getattr(src, arrName)]
    chosen = random.sample(candidates, min(slots, len(candidates)))
    for slot, src in enumerate(chosen, start=1):
        article = random.choice(getattr(src, arrName))
        tag = str(level) + '-' + str(slot) + 'xx'
        template = template.replace('xxURL' + tag, article.url)
        template = template.replace('xxTitle' + tag, article.title)
        template = template.replace('xxImg' + tag, article.img)
    return template


def buildOutput(newsSourceArr):
    """Return the HTML template with headline placeholders filled in.

    Reads 'html_template/template.html' (relative to the working directory)
    and fills four h1 slots and four h2 slots with randomly chosen articles
    from randomly chosen, distinct sources.

    Args:
        newsSourceArr: list of NewsSource objects whose h1Arr/h2Arr hold
            Article objects.

    Returns:
        The updated template text. The 'xxTimexx' placeholder is not
        touched here; printOutputHTML fills it.
    """
    #read in the template html file
    with open('html_template/template.html', 'r') as f:
        template = f.read()

    #replace html template locations with data from newsSourceArr
    # NOTE(review): url/title/img are inserted verbatim (no HTML escaping);
    # confirm whether the article builder already yields HTML-safe text.
    template = _fillSlots(template, newsSourceArr, 1)
    template = _fillSlots(template, newsSourceArr, 2)

    #return updated text
    return template


def printOutputHTML(outputHTML, outFile):
    """Stamp *outputHTML* with the current local time and write it to *outFile*.

    The 'xxTimexx' placeholder is replaced with a timestamp such as
    'Thu, Feb 9, 10:18am EST'. The day, 12-hour clock and am/pm parts are
    built by hand because the '%-d', '%-I' and '%P' strftime directives are
    platform-specific extensions.
    """
    now = time.localtime()
    hour12 = now.tm_hour % 12 or 12  # 0 and 12 o'clock both display as 12
    meridiem = 'am' if now.tm_hour < 12 else 'pm'
    timestamp = '%s %d, %d:%02d%s %s' % (
        time.strftime('%a, %b', now),
        now.tm_mday,
        hour12,
        now.tm_min,
        meridiem,
        time.strftime('%Z', now),
    )
    outputHTML = outputHTML.replace('xxTimexx', timestamp)

    with open(outFile, 'w') as f:
        f.write(outputHTML)


def buildNewsSourceArr(sourceList):
    """Download each source's front page and attach its articles.

    For the source at index ``i`` the page is fetched with wget into
    'scratch/temp<i>.html', parsed with extractURLs, and every extracted URL
    is turned into an article via buildArticle and added to the source at
    the matching headline level.

    Args:
        sourceList: list of NewsSource objects; modified in place.

    Returns:
        The same *sourceList* object.

    Raises:
        OSError: if wget cannot be launched or the downloaded file cannot
            be opened.
    """
    import subprocess  # local import: the file's import block is outside this chunk

    #build the data structure
    for i, source in enumerate(sourceList):
        tempPath = 'scratch/temp' + str(i) + '.html'

        #download file
        #(argument list instead of a shell string, so characters such as
        #'&' or ';' in the URL are not interpreted by a shell)
        subprocess.call(['wget', '-q', '-O', tempPath,
                         '--no-check-certificate', source.url])

        #read file
        with open(tempPath, 'r', encoding="utf8") as f:
            content = f.read()

        #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
        #os.remove('scratch/temp'+str(i)+'.html')

        #add stories etc to the NewsSource object
        h1s, h2s, h3s = extractURLs(content, source)

        #build the Article objects and add to newsSource's appropriate list
        for level, urls in enumerate((h1s, h2s, h3s), start=1):
            for url in urls:
                source.addArticle(buildArticle(url, source.name), level)

    #return the original sourceList,
    #since everything should have been modified in place
    return sourceList


# ---- patch boundary: the definitions below are the new file unbiasedObjects.py ----


class Article():
    """A single news story: title, link, image and originating source name."""

    #class-level defaults; __init__ sets all four on every instance
    title=''
    url=''
    img=''
    source=''

    def __init__(self, title, url, img, source):
        self.title=title
        self.url=url
        self.img=img
        self.source=source

    def __str__(self):
        """Return title, source, url and img on separate lines between dashed rules."""
        return '-----------\n'+self.title+'\n'+self.source+'\n'+self.url+'\n'+self.img+'\n'+'-----------'


class NewsSource():
    """Scraping configuration for one news site plus the articles found on it."""

    name=''
    url=''
    #multiple start values to step through file. end value default to '"'
    #(the *DelStart lists and *SectionDivider* values below are placeholders;
    #__init__ rebinds them per instance, so the class-level lists are never
    #shared. The *DelEnd values are NOT set by __init__ and stay '"'.)
    h1SectionDividerStart=None
    h1SectionDividerEnd=None
    h1DelStart=[]
    h1DelEnd='"'
    h2SectionDividerStart=None
    h2SectionDividerEnd=None
    h2DelStart=[]
    h2DelEnd='"'
    h3SectionDividerStart=None
    h3SectionDividerEnd=None
    h3DelStart=[]
    h3DelEnd='"'
    #arrays of Article object types
    h1Arr=None
    h2Arr=None
    h3Arr=None

    def __init__(self, name, url,
                 h1DelStart, h2DelStart, h3DelStart,
                 h1SectionDividerStart=None, h1SectionDividerEnd=None,
                 h2SectionDividerStart=None, h2SectionDividerEnd=None,
                 h3SectionDividerStart=None, h3SectionDividerEnd=None):
        """Store the site name, base url and per-level delimiters.

        Each ``hNDelStart`` is a list of strings to step through, in order,
        to reach a link; the optional section dividers bound the region of
        the page that is searched. The three article lists start empty.
        """
        self.name=name
        self.url=url
        self.h1DelStart=h1DelStart
        self.h2DelStart=h2DelStart
        self.h3DelStart=h3DelStart
        self.h1SectionDividerStart=h1SectionDividerStart
        self.h2SectionDividerStart=h2SectionDividerStart
        self.h3SectionDividerStart=h3SectionDividerStart
        self.h1SectionDividerEnd=h1SectionDividerEnd
        self.h2SectionDividerEnd=h2SectionDividerEnd
        self.h3SectionDividerEnd=h3SectionDividerEnd
        #fresh lists per instance
        self.h1Arr=[]
        self.h2Arr=[]
        self.h3Arr=[]

    def addArticle(self, article, level):
        """Append *article* to the list for headline *level* (1, 2 or 3).

        Any other level is reported on stdout and the article is dropped.
        """
        if level==1:
            self.h1Arr.append(article)
        elif level==2:
            self.h2Arr.append(article)
        elif level==3:
            self.h3Arr.append(article)
        else:
            print("Error: invalid level in NewsSource.addArticle: ", level)

# -- cgit v1.2.3