From 3126b094485af5a374d0ba400604a8b2805337af Mon Sep 17 00:00:00 2001
From: ssstvinc2
Date: Thu, 9 Feb 2017 10:18:18 -0500
Subject: VIM is fucking garbage

---
 legacy_py/downloadFiles.py  |  24 +++++++
 legacy_py/get_sources.py    | 149 ++++++++++++++++++++++++++++++++++++++++++++
 legacy_py/update_stories.py |  10 +++
 3 files changed, 183 insertions(+)
 create mode 100644 legacy_py/downloadFiles.py
 create mode 100755 legacy_py/get_sources.py
 create mode 100755 legacy_py/update_stories.py
(limited to 'legacy_py')

diff --git a/legacy_py/downloadFiles.py b/legacy_py/downloadFiles.py
new file mode 100644
index 0000000..9a79e27
--- /dev/null
+++ b/legacy_py/downloadFiles.py
@@ -0,0 +1,24 @@
+import os
+
+files=[]
+files.append('http://www.nytimes.com')
+files.append('http://www.nbcnews.com')
+files.append('http://www.foxnews.com')
+
+stories=[]
+stories.append('https://www.nytimes.com/2017/02/03/business/dealbook/trump-congress-financial-regulations.html')
+stories.append('http://www.nbcnews.com/news/us-news/over-100-000-visas-have-been-revoked-immigration-ban-justice-n716121')
+stories.append('http://www.foxnews.com/politics/2017/02/03/calls-mount-for-trump-administration-to-label-muslim-brotherhood-terrorist-organization.html')
+
+for i in range(len(files)):
+    os.system('wget -O source'+str(i+1)+'.html --no-check-certificate '+files[i])
+
+for i in range(len(stories)):
+    os.system('wget -O story'+str(i+1)+'.html --no-check-certificate '+stories[i])
+
+'''
+f=open('testOut.html', 'r', encoding="utf8")
+content=f.read()
+f.close()
+#os.remove('testOut.html')
+'''
diff --git a/legacy_py/get_sources.py b/legacy_py/get_sources.py
new file mode 100755
index 0000000..cb23f61
--- /dev/null
+++ b/legacy_py/get_sources.py
@@ -0,0 +1,149 @@
+from bs4 import BeautifulSoup
+import wget
+import os
+
+def getHeds(heds, h_class=None):
+    hed_text=[]
+    hed_hrefs=[]
+    #print(heds[0])
+    for hed in heds:
+        #print('*'+hed.string)
+        if hed.a:
+            try:
+                if h_class==None or hed['class'][0]==h_class:
+                    if hed.a.string!=None: hed_text.append(hed.a.string.strip())
+                    if hed.a['href']!='': hed_hrefs.append(hed.a['href'])
+            except:
+                continue
+    return hed_text, hed_hrefs
+
+
+
+def get_nyt():
+    #init vars
+    stories={}
+
+    #download and soup web page
+    wget.download('https://nytimes.com', out="nyt.html")
+    soup = BeautifulSoup(open("nyt.html", encoding="utf8"), "lxml")
+    os.remove('nyt.html')
+
+    #get top story
+    h1s=soup('h1')
+    h1_heds, h1_href = getHeds(h1s, 'story-heading')
+    stories['h1']={'heds':[h1_heds[0]], 'href':[h1_href[0]]}
+
+    #get secondary stories
+    aCol = soup.find_all('div', 'a-column')
+    h2s=aCol[0].find_all('h2')
+    h2_heds, h2_href = getHeds(h2s, 'story-heading')
+    stories['h2']={'heds':h2_heds, 'href':h2_href}
+
+    #get tertiary stories
+    bCol = soup.find_all('div', 'b-column')
+    h3s=bCol[0].find_all('h2')
+    h3_heds, h3_href = getHeds(h3s, 'story-heading')
+    stories['h3']={'heds':h3_heds, 'href':h3_href}
+
+    return stories
+
+
+def get_fox():
+    #init vars
+    stories={}
+    h2_heds = []
+    h2_href = []
+
+    #download and soup web page
+    wget.download('http://www.foxnews.com', out="fox.html")
+    soup = BeautifulSoup(open("fox.html", encoding="utf8"), "lxml")
+    os.remove('fox.html')
+
+    #get top story
+    h1s=soup('h1')
+    h1_heds, h1_href = getHeds(h1s)
+    stories['h1']={'heds':[h1_heds[0]], 'href':[h1_href[0]]}
+
+    #get secondary stories - (loop over the first top-stories list)
+    topStories = soup('div', 'top-stories')
+    topStoriesCols = topStories[0].ul
+    for c in topStoriesCols.children:
+        if c.string==None:
+            h2_heds.append(c.h3.text)
+            h2_href.append(c.a['href'])
+
+    stories['h2']={'heds':h2_heds, 'href':h2_href}
+
+    #get tertiary stories
+    h3s=topStoriesCols.find_all('li')
+    h3_heds, h3_href=getHeds(h3s)
+    for href in h3_href:
+        if href in h2_href:
+            h3_href.remove(href)
+
+    stories['h3']={'heds':h3_heds, 'href':h3_href}
+
+    return stories
+
+
+def get_nbc():
+    #init vars
+    stories={}
+
+    #download and soup web page
+    wget.download('http://www.nbcnews.com', out="nbc.html")
+    soup = BeautifulSoup(open("nbc.html", encoding="utf8"), "lxml")
+    os.remove('nbc.html')
+
+    #get top story
+    panel = soup.find_all('div', 'panel-txt_hero')
+    h1_heds=panel[0].find_all('h3')
+    panel = soup.find_all('div', 'panel-txt')
+    stories['h1']={'heds':[h1_heds[0].text.strip()], 'href':['http://www.nbcnews.com'+panel[0].a['href']]}
+
+    #get secondary stories - div class panel
+    h2s = soup.find_all('div', 'story-link_default-height')
+    story_heds=[]
+    for item in h2s:
+        story_heds.append(item.h3.text.strip())
+    h2_heds, h2_href = getHeds(h2s)
+    for i in range(len(h2_href)):
+        h2_href[i]='http://www.nbcnews.com'+h2_href[i]
+    stories['h2']={'heds':story_heds[:3], 'href':h2_href[:3]}
+
+    #get tertiary stories - div class story-link
+    stories['h3']={'heds':story_heds[3:], 'href':h2_href[3:]}
+
+    return stories
+
+def getTwitterCard(url):
+    card={}
+
+    #wget.download(url, out="card.html")
+    cmd='wget --no-check-certificate '+url+' -O card.html'
+    print(cmd)
+    ret=os.system(cmd)
+    print(ret)
+    soup = BeautifulSoup(open("card.html", encoding="utf8"), "lxml")
+    #os.remove('card.html')
+
+    hed=soup.find_all('meta', {'name' : 'twitter:title'})
+    card['hed']=hed[0]['content']
+
+    img=soup.find_all('meta', {'name' : 'twitter:image'})
+    card['img']=img[0]['content']
+
+    return card
+
+
+
+'''
+links=get_nbc()
+card=getTwitterCard(links['h2']['href'][0])
+print(card)
+
+
+nyt=get_nyt()
+fox=get_fox()
+nbc=get_nbc()
+'''
diff --git a/legacy_py/update_stories.py b/legacy_py/update_stories.py
new file mode 100755
index 0000000..4340148
--- /dev/null
+++ b/legacy_py/update_stories.py
@@ -0,0 +1,10 @@
+from get_sources import *
+
+nyt=get_nyt()
+#fox=get_fox()
+#nbc=get_nbc()
+
+url=nyt['h2']['href'][0]
+print(url)
+card=getTwitterCard("https://www.nytimes.com/2017/01/30/us/politics/trump-immigration-ban-memo.html")
+#print(card)
--
cgit v1.2.3
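
Note, not part of the patch above: getTwitterCard() shells out to wget through os.system() and then re-reads the downloaded file from disk. Below is a minimal sketch of the same twitter:title / twitter:image extraction done in-process with the requests library instead of an external wget call; the requests dependency and the fetch_twitter_card name are illustrative assumptions, not code from this commit.

# Sketch only (not in this commit): fetch a page in-process and read its
# Twitter Card <meta> tags, mirroring what getTwitterCard() does via wget.
# Assumes the third-party 'requests' package is installed.
import requests
from bs4 import BeautifulSoup

def fetch_twitter_card(url):
    # verify=False mirrors wget's --no-check-certificate flag
    resp = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")

    card = {}
    hed = soup.find("meta", attrs={"name": "twitter:title"})
    img = soup.find("meta", attrs={"name": "twitter:image"})
    if hed is not None:
        card["hed"] = hed["content"]
    if img is not None:
        card["img"] = img["content"]
    return card

# Same story URL that update_stories.py passes to getTwitterCard()
print(fetch_twitter_card("https://www.nytimes.com/2017/01/30/us/politics/trump-immigration-ban-memo.html"))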