From 9302f2f7ed6585cd384391753d0d711f672e12a5 Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Thu, 9 Feb 2017 20:28:28 -0500 Subject: Deleted a few unnecessary files from the repo --- legacy_py/get_sources.py | 149 ----------------------------------------------- 1 file changed, 149 deletions(-) delete mode 100755 legacy_py/get_sources.py (limited to 'legacy_py/get_sources.py') diff --git a/legacy_py/get_sources.py b/legacy_py/get_sources.py deleted file mode 100755 index cb23f61..0000000 --- a/legacy_py/get_sources.py +++ /dev/null @@ -1,149 +0,0 @@ -from bs4 import BeautifulSoup -import wget -import os - -def getHeds(heds, h_class=None): - hed_text=[] - hed_hrefs=[] - #print(heds[0]) - for hed in heds: - #print('*'+hed.string) - if hed.a: - try: - if h_class==None or hed['class'][0]==h_class: - if hed.a.string!=None: hed_text.append(hed.a.string.strip()) - if hed.a['href']!='': hed_hrefs.append(hed.a['href']) - except: - continue - return hed_text, hed_hrefs - - - -def get_nyt(): - #init vars - stories={} - - #download and soup web page - wget.download('https://nytimes.com', out="nyt.html") - soup = BeautifulSoup(open("nyt.html", encoding="utf8"), "lxml") - os.remove('nyt.html') - - #get top story - h1s=soup('h1') - h1_heds, h1_href = getHeds(h1s, 'story-heading') - stories['h1']={'heds':[h1_heds[0]], 'href':[h1_href[0]]} - - #get secondary stories - aCol = soup.find_all('div', 'a-column') - h2s=aCol[0].find_all('h2') - h2_heds, h2_href = getHeds(h2s, 'story-heading') - stories['h2']={'heds':h2_heds, 'href':h2_href} - - #get tertiary stories - bCol = soup.find_all('div', 'b-column') - h3s=bCol[0].find_all('h2') - h3_heds, h3_href = getHeds(h3s, 'story-heading') - stories['h3']={'heds':h3_heds, 'href':h3_href} - - return stories - - -def get_fox(): - #init vars - stories={} - h2_heds = [] - h2_href = [] - - #download and soup web page - wget.download('http://www.foxnews.com', out="fox.html") - soup = BeautifulSoup(open("fox.html", encoding="utf8"), "lxml") - os.remove('fox.html') - - #get top story - h1s=soup('h1') - h1_heds, h1_href = getHeds(h1s) - stories['h1']={'heds':[h1_heds[0]], 'href':[h1_href[0]]} - - #get secondary stories -
  • (loop for first ) - topStories = soup('div', 'top-stories') - topStoriesCols = topStories[0].ul - for c in topStoriesCols.children: - if c.string==None: - h2_heds.append(c.h3.text) - h2_href.append(c.a['href']) - - stories['h2']={'heds':h2_heds, 'href':h2_href} - - #get tertiary stories - h3s=topStoriesCols.find_all('li') - h3_heds, h3_href=getHeds(h3s) - for href in h3_href: - if href in h2_href: - h3_href.remove(href) - - stories['h3']={'heds':h3_heds, 'href':h3_href} - - return stories - - -def get_nbc(): - #init vars - stories={} - - #download and soup web page - wget.download('http://www.nbcnews.com', out="nbc.html") - soup = BeautifulSoup(open("nbc.html", encoding="utf8"), "lxml") - os.remove('nbc.html') - - #get top story - panel = soup.find_all('div', 'panel-txt_hero') - h1_heds=panel[0].find_all('h3') - panel = soup.find_all('div', 'panel-txt') - stories['h1']={'heds':[h1_heds[0].text.strip()], 'href':['http://www.nbcnews.com'+panel[0].a['href']]} - - #get secondary stories - div class panel - h2s = soup.find_all('div', 'story-link_default-height') - story_heds=[] - for item in h2s: - story_heds.append(item.h3.text.strip()) - h2_heds, h2_href = getHeds(h2s) - for i in range(len(h2_href)): - h2_href[i]='http://www.nbcnews.com'+h2_href[i] - stories['h2']={'heds':story_heds[:3], 'href':h2_href[:3]} - - #get tertiary stories - div class story-link - stories['h3']={'heds':story_heds[3:], 'href':h2_href[3:]} - - return stories - -def getTwitterCard(url): - card={} - - #wget.download(url, out="card.html") - cmd='wget --no-check-certificate '+url+' -O card.html' - print(cmd) - ret=os.system(cmd) - print(ret) - soup = BeautifulSoup(open("card.html", encoding="utf8"), "lxml") - #os.remove('card.html') - - hed=soup.find_all('meta', {'name' : 'twitter:title'}) - card['hed']=hed[0]['content'] - - img=soup.find_all('meta', {'name' : 'twitter:image'}) - card['img']=img[0]['content'] - - return card - - - -''' -links=get_nbc() -card=getTwitterCard(links['h2']['href'][0]) -print(card) - - -nyt=get_nyt() -fox=get_fox() -nbc=get_nbc() -''' -- cgit v1.2.3