diff options
author | ssstvinc2 <sstvinc2@gmail.com> | 2017-02-19 23:04:31 -0500 |
---|---|---|
committer | ssstvinc2 <sstvinc2@gmail.com> | 2017-02-19 23:04:31 -0500 |
commit | b544a59cb96193ddcd0b8c0f9cc70bda973415a5 (patch) | |
tree | ec5edbe35869f1b5b65a57d1a3b746c83dda3829 | |
parent | 53de97fd3c6fdb4c95a89171b52064a05b157fbf (diff) |
Fixed bounding box on h1s
-rwxr-xr-x[-rw-r--r--] | html_template/BAKtemplate.html | 236 | ||||
-rwxr-xr-x[-rw-r--r--] | html_template/BAKunbiased.css | 206 | ||||
-rwxr-xr-x[-rw-r--r--] | html_template/Penguins.jpg | bin | 777835 -> 777835 bytes | |||
-rw-r--r-- | html_template/newtemplate.html | 300 | ||||
-rwxr-xr-x[-rw-r--r--] | html_template/template.html | 18 | ||||
-rwxr-xr-x[-rw-r--r--] | html_template/unbiased.css | 7 | ||||
-rwxr-xr-x[-rw-r--r--] | main.py | 0 | ||||
-rwxr-xr-x[-rw-r--r--] | parser.py | 1610 |
8 files changed, 1194 insertions, 1183 deletions
diff --git a/html_template/BAKtemplate.html b/html_template/BAKtemplate.html index ab1dbb9..94a3796 100644..100755 --- a/html_template/BAKtemplate.html +++ b/html_template/BAKtemplate.html @@ -1,118 +1,118 @@ -<!DOCTYPE html> -<html> - <head> - <meta charset="utf-8"> - <link rel="stylesheet" href="html_template/unbiased.css"> - <link rel="stylesheet" href="unbiased.css"> - <title>UnBiased</title> - </head> -<body> - -<div id="page-header"> - <span id="title-1" class="title">un</span><span id="title-2" class="title">biased</span><br /> - <span id="subtitle">a different way to read the news</span> - <p id="timestamp">Last updated: xxTimexx</p> -</div> - -<div id="page-container"> -<div id="top-stories"> - <div class="top-story"> - <a target="_blank" href="redirects/h1-1.html" id="top-story-1"> - <div class="top-stories-img"> - <img src="xxImg1-1xx" /> - </div> - <div class="top-stories-hed">xxTitle1-1xx</div> - </a> - <div class="top-stories-desc">xxDesc1-1xx</div> - </div> - - <div class="top-story"> - <a target="_blank" href="redirects/h1-2.html" id="top-story-2"> - <div class="top-stories-img"> - <img src="xxImg1-2xx" /> - </div> - <div class="top-stories-hed">xxTitle1-2xx</div> - </a> - <div class="top-stories-desc">xxDesc1-2xx</div> - </div> - - <div class="top-story"> - <a target="_blank" href="redirects/h1-3.html" id="top-story-3"> - <div class="top-stories-img"> - <img src="xxImg1-3xx" /> - </div> - <div class="top-stories-hed">xxTitle1-3xx</div> - </a> - <div class="top-stories-desc">xxDesc1-3xx</div> - </div> -</div> - -<div id="middle-stories"> - - <a target="_blank" href="redirects/h2-1.html" > - <div class="middle-story"> - <div class="middle-stories-img"> - <img src="xxImg2-1xx" /> - <p class="middle-stories-hed">xxTitle2-1xx</p> - </div> - </div> - </a> - - <a target="_blank" href="redirects/h2-2.html" > - <div class="middle-story"> - <div class="middle-stories-img"> - <img src="xxImg2-2xx" /> - <span class="middle-stories-hed">xxTitle2-2xx</span> - 
</div> - </div> - </a> - - <a target="_blank" href="redirects/h2-3.html" > - <div class="middle-story"> - <div class="middle-stories-img"> - <img src="xxImg2-3xx" /> - <span class="middle-stories-hed">xxTitle2-3xx</span> - </div> - </div> - </a> - - - <a target="_blank" href="redirects/h2-4.html" > - <div class="middle-story"> - <div class="middle-stories-img"> - <img src="xxImg2-4xx" /> - <span class="middle-stories-hed">xxTitle2-4xx</span> - </div> - </div> - </a> - - <a target="_blank" href="redirects/h2-5.html" > - <div class="middle-story"> - <div class="middle-stories-img"> - <img src="xxImg2-5xx" /> - <span class="middle-stories-hed">xxTitle2-5xx</span> - </div> - </div> - </a> - - <a target="_blank" href="redirects/h2-6.html" > - <div class="middle-story"> - <div class="middle-stories-img"> - <img src="xxImg2-6xx" /> - <span class="middle-stories-hed">xxTitle2-6xx</span> - </div> - </div> - </a> - -</div> - -<div id="bottom-stories"> -</div> - -</div> - -<div id="sources"> - Sources: xxSourcesxx -</div> -</body> -</html> +<!DOCTYPE html>
+<html>
+ <head>
+ <meta charset="utf-8">
+ <link rel="stylesheet" href="html_template/unbiased.css">
+ <link rel="stylesheet" href="unbiased.css">
+ <title>UnBiased</title>
+ </head>
+<body>
+
+<div id="page-header">
+ <span id="title-1" class="title">un</span><span id="title-2" class="title">biased</span><br />
+ <span id="subtitle">a different way to read the news</span>
+ <p id="timestamp">Last updated: xxTimexx</p>
+</div>
+
+<div id="page-container">
+<div id="top-stories">
+ <div class="top-story">
+ <a target="_blank" href="redirects/h1-1.html" id="top-story-1">
+ <div class="top-stories-img">
+ <img src="xxImg1-1xx" />
+ </div>
+ <div class="top-stories-hed">xxTitle1-1xx</div>
+ </a>
+ <div class="top-stories-desc">xxDesc1-1xx</div>
+ </div>
+
+ <div class="top-story">
+ <a target="_blank" href="redirects/h1-2.html" id="top-story-2">
+ <div class="top-stories-img">
+ <img src="xxImg1-2xx" />
+ </div>
+ <div class="top-stories-hed">xxTitle1-2xx</div>
+ </a>
+ <div class="top-stories-desc">xxDesc1-2xx</div>
+ </div>
+
+ <div class="top-story">
+ <a target="_blank" href="redirects/h1-3.html" id="top-story-3">
+ <div class="top-stories-img">
+ <img src="xxImg1-3xx" />
+ </div>
+ <div class="top-stories-hed">xxTitle1-3xx</div>
+ </a>
+ <div class="top-stories-desc">xxDesc1-3xx</div>
+ </div>
+</div>
+
+<div id="middle-stories">
+
+ <a target="_blank" href="redirects/h2-1.html" >
+ <div class="middle-story">
+ <div class="middle-stories-img">
+ <img src="xxImg2-1xx" />
+ <p class="middle-stories-hed">xxTitle2-1xx</p>
+ </div>
+ </div>
+ </a>
+
+ <a target="_blank" href="redirects/h2-2.html" >
+ <div class="middle-story">
+ <div class="middle-stories-img">
+ <img src="xxImg2-2xx" />
+ <span class="middle-stories-hed">xxTitle2-2xx</span>
+ </div>
+ </div>
+ </a>
+
+ <a target="_blank" href="redirects/h2-3.html" >
+ <div class="middle-story">
+ <div class="middle-stories-img">
+ <img src="xxImg2-3xx" />
+ <span class="middle-stories-hed">xxTitle2-3xx</span>
+ </div>
+ </div>
+ </a>
+
+
+ <a target="_blank" href="redirects/h2-4.html" >
+ <div class="middle-story">
+ <div class="middle-stories-img">
+ <img src="xxImg2-4xx" />
+ <span class="middle-stories-hed">xxTitle2-4xx</span>
+ </div>
+ </div>
+ </a>
+
+ <a target="_blank" href="redirects/h2-5.html" >
+ <div class="middle-story">
+ <div class="middle-stories-img">
+ <img src="xxImg2-5xx" />
+ <span class="middle-stories-hed">xxTitle2-5xx</span>
+ </div>
+ </div>
+ </a>
+
+ <a target="_blank" href="redirects/h2-6.html" >
+ <div class="middle-story">
+ <div class="middle-stories-img">
+ <img src="xxImg2-6xx" />
+ <span class="middle-stories-hed">xxTitle2-6xx</span>
+ </div>
+ </div>
+ </a>
+
+</div>
+
+<div id="bottom-stories">
+</div>
+
+</div>
+
+<div id="sources">
+ Sources: xxSourcesxx
+</div>
+</body>
+</html>
diff --git a/html_template/BAKunbiased.css b/html_template/BAKunbiased.css index 49b6dce..ade390b 100644..100755 --- a/html_template/BAKunbiased.css +++ b/html_template/BAKunbiased.css @@ -1,104 +1,104 @@ -a:link, a:visited, a:hover, a:active { - color: #00f; - text-decoration:none; - } - -#page-header{ - text-align:center; - padding:.5em 0 1em; - margin-bottom:1em; - border-bottom:1px solid #000; -} - -.title{ - font-size:3em; -} - -#title-1{ - font-style:italic; - color:#d00; -} - -#title-2{ - color:#00d; -} - -#subtitle{ - font-size:1.25em; -} - -#timestamp{ - margin:.5em 0 0 0; - font-size:.8em; -} - -#page-container{ - width:1150px; - padding:0 1em; - margin-left:auto; - margin-right:auto; -} - -#top-stories{ - width:1150px; - margin-left:auto; - margin-right:auto; - font-size:1.25em; -} - -.top-story{ - width:350px; - float:left; - margin:0 .5em; -} - -.top-stories-img{ - width:350px; - height:200px; - overflow:hidden; -} - -.top-stories-img img{ - width:100%; - display:block; - vertical-align:text-bottom; -} - -.top-stories-desc{ - font-size:.8em; - padding-top:.5em; -} - -#middle-stories{ - clear:both; - width:1000px; - margin:0 auto; -} - -.middle-story{ - margin:2em 5px; - width:45%; - float:left; - height:100px; -} - -.middle-story img{ - vertical-align:middle; - height:100px; - float:left; - margin-right:1em; -} - -.middle-stories-hed{ - font-size:1.1em; -} - -.middle-story p{ - display:block; -} - -#sources{ - clear:both; - padding-top:4em; - font-size:.8em; +a:link, a:visited, a:hover, a:active {
+ color: #00f;
+ text-decoration:none;
+ }
+
+#page-header{
+ text-align:center;
+ padding:.5em 0 1em;
+ margin-bottom:1em;
+ border-bottom:1px solid #000;
+}
+
+.title{
+ font-size:3em;
+}
+
+#title-1{
+ font-style:italic;
+ color:#d00;
+}
+
+#title-2{
+ color:#00d;
+}
+
+#subtitle{
+ font-size:1.25em;
+}
+
+#timestamp{
+ margin:.5em 0 0 0;
+ font-size:.8em;
+}
+
+#page-container{
+ width:1150px;
+ padding:0 1em;
+ margin-left:auto;
+ margin-right:auto;
+}
+
+#top-stories{
+ width:1150px;
+ margin-left:auto;
+ margin-right:auto;
+ font-size:1.25em;
+}
+
+.top-story{
+ width:350px;
+ float:left;
+ margin:0 .5em;
+}
+
+.top-stories-img{
+ width:350px;
+ height:200px;
+ overflow:hidden;
+}
+
+.top-stories-img img{
+ width:100%;
+ display:block;
+ vertical-align:text-bottom;
+}
+
+.top-stories-desc{
+ font-size:.8em;
+ padding-top:.5em;
+}
+
+#middle-stories{
+ clear:both;
+ width:1000px;
+ margin:0 auto;
+}
+
+.middle-story{
+ margin:2em 5px;
+ width:45%;
+ float:left;
+ height:100px;
+}
+
+.middle-story img{
+ vertical-align:middle;
+ height:100px;
+ float:left;
+ margin-right:1em;
+}
+
+.middle-stories-hed{
+ font-size:1.1em;
+}
+
+.middle-story p{
+ display:block;
+}
+
+#sources{
+ clear:both;
+ padding-top:4em;
+ font-size:.8em;
}
\ No newline at end of file diff --git a/html_template/Penguins.jpg b/html_template/Penguins.jpg Binary files differindex 030ab8a..030ab8a 100644..100755 --- a/html_template/Penguins.jpg +++ b/html_template/Penguins.jpg diff --git a/html_template/newtemplate.html b/html_template/newtemplate.html index 923dee2..0cec766 100644 --- a/html_template/newtemplate.html +++ b/html_template/newtemplate.html @@ -1,150 +1,150 @@ -<!DOCTYPE html> -<html> - <head> - <meta charset="utf-8"> - <link rel="stylesheet" href="unbiased.css"> - <title>UnBiased</title> - </head> -<body> - -<div id="page-header"> - <span id="title-1" class="title">un</span><span id="title-2" class="title">biased</span><br /> - <span id="subtitle">a different way to read the news</span> - <p id="timestamp">Last updated: Mon, Feb 13, 7:51pm EST</p> -</div> - -<div id="page-container"> - <div id="top-stories"> - - <div class="top-story"> - <a target="_blank" id="top-story-1" href="" onclick="location.href='xxURL1-1'"> - <div class="top-stories-img" style="background-image: url('http://www.theblaze.com/wp-content/uploads/2017/02/GettyImages-465794068-1280x720.jpg');" /> - </div> - <div class="top-stories-hed">Rand Paul and Cory Booker push bipartisan effort to limit solitary confinement for juveniles</div> - </a> - <div class="top-stories-desc">Sen. Rand Paul (R-Ky) and Sen …</div> - </div> - - <div class="top-story"> - <a target="_blank" href="" onclick="location.href='xxURL1-2'"> - <div class="top-stories-img" style="background-image: url('http://cdn.weeklystandard.biz/cache/r960-90b8d8d5cbcef212ecae2a5c455fed8f.jpg');" /> - </div> - <div class="top-stories-hed">Bibi and Donald</div> - </a> - <div class="top-stories-desc">This week, Israel's prime minister will visit Washington and meet with our new president. They will have a complex agenda. 
Benjamin ...</div> - </div> - - <div class="top-story"> - <a target="_blank" href="" onclick="location.href='xxURL1-3'"> - <div class="top-stories-img" style="background-image: url('https://static01.nyt.com/images/2017/02/13/multimedia/DavidOyelowo-UnitedKingdom/DavidOyelowo-UnitedKingdom-facebookJumbo.png');" /> - </div> - <div class="top-stories-hed">David Oyelowo on How to Play a Real King</div> - </a> - <div class="top-stories-desc">He stars in “A United Kingdom,” about the Botswana leader who married a white woman and set off an international crisis.</div> - </div> - - <div class="top-story"> - <a target="_blank" href="" onclick="location.href='xxURL1-4'"> - <div class="top-stories-img" style="background-image: url('http://a57.foxnews.com/images.foxnews.com/content/fox-news/us/2017/02/13/judge-orders-ohio-village-to-pay-back-3-million-to-lead-footed-drivers/_jcr_content/par/featured-media/media-0.img.jpg/0/0/1487019011476.jpg?ve=1');" /> - </div> - <div class="top-stories-hed">Judge orders Ohio village to pay back $3 million to lead-footed drivers</div> - </a> - <div class="top-stories-desc">Speed cameras became a cash cow for the small village of New Miami, Ohio.</div> - </div> - - </div> - - <div id="middle-stories"> - - <a target="_blank" href="" onclick="location.href='xxURL2-1'"> - <div class="middle-story"> - <div class="middle-stories-img" style="background-image: url('http://www.theblaze.com/wp-content/uploads/2017/02/GettyImages-635148734-1280x720.jpg');"> - </div> - <div class="middle-stories-hed">DHS says 75 percent of those detained in ICE raids last week were ‘criminal aliens’</div> - </div> - </a> - - <a target="_blank" href="" onclick="location.href='xxURL2-2'"> - <div class="middle-story"> - <div class="middle-stories-img" style="background-image: url('http://a57.foxnews.com/media2.foxnews.com/BrightCove/694940094001/2017/02/12/0/0/694940094001_5320280093001_5320267547001-vs.jpg?ve=1');"> - </div> - <div class="middle-stories-hed">Drama grips 
Trump inner circle, as president charges ahead on agenda</div> - </div> - </a> - - <a target="_blank" href="" onclick="location.href='xxURL2-3'"> - <div class="middle-story"> - <div class="middle-stories-img" style="background-image: url('http://ichef.bbci.co.uk/news/1024/cpsprodpb/C9C5/production/_94635615_6c33162f-1c24-487d-8a51-bb7b13ec063f.jpg');"> - </div> - <div class="middle-stories-hed">Ku Klux Klan killing: Frank Ancona's wife and stepson charged - BBC News</div> - </div> - </a> - - <a target="_blank" href="" onclick="location.href='xxURL2-4'"> - <div class="middle-story"> - <div class="middle-stories-img" style="background-image: url('http://media1.s-nbcnews.com/j/newscms/2017_07/1900281/13217-oroville-dam-724a-rs_4a8b5ba9690488f11410f156833e1b70.nbcnews-fp-1200-800.jpg');"> - </div> - <div class="middle-stories-hed">Nearly 190,000 ordered to evacuate in California dam spillway failure</div> - </div> - </a> - - <a target="_blank" href="" > - <div class="middle-story"> - <div class="middle-stories-img" style="background-image: url('http://cbsnews1.cbsistatic.com/hub/i/2017/02/13/4ad800d9-69ba-4102-a8ec-af12e8eb6adb/021317-news.jpg');"> - </div> - <div class="middle-stories-hed">Jerry Sandusky's son, 41, arrested on child sex charges</div> - </div> - </a> - - <a target="_blank" href="" > - <div class="middle-story"> - <div class="middle-stories-img" style="background-image: url('https://static01.nyt.com/images/2017/02/14/us/14townhall1/14townhall1-facebookJumbo.jpg');"> - </div> - <div class="middle-stories-hed">Angry Town Hall Meetings on Health Care Law, and Few Answers</div> - </div> - </a> - - - </div> - - <div id="bottom-stories"> - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-1xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-2xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-3xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" 
href="">xxTitle3-4xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-5xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-6xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-7xx</a> - </div> - - <div class="bottom-story"> - <a target="_blank" href="">xxTitle3-8xx</a> - </div> -</div> - -</div> - -<div id="sources"> - Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News -</div> -</body> -</html> +<!DOCTYPE html>
+<html>
+ <head>
+ <meta charset="utf-8">
+ <link rel="stylesheet" href="unbiased.css">
+ <title>UnBiased</title>
+ </head>
+<body>
+
+<div id="page-header">
+ <span id="title-1" class="title">un</span><span id="title-2" class="title">biased</span><br />
+ <span id="subtitle">a different way to read the news</span>
+ <p id="timestamp">Last updated: Mon, Feb 13, 7:51pm EST</p>
+</div>
+
+<div id="page-container">
+ <div id="top-stories">
+
+ <div class="top-story">
+ <a target="_blank" id="top-story-1" href="" onclick="location.href='xxURL1-1'">
+ <div class="top-stories-img" style="background-image: url('http://www.theblaze.com/wp-content/uploads/2017/02/GettyImages-465794068-1280x720.jpg');" />
+ </div>
+ <div class="top-stories-hed">Rand Paul and Cory Booker push bipartisan effort to limit solitary confinement for juveniles</div>
+ </a>
+ <div class="top-stories-desc">Sen. Rand Paul (R-Ky) and Sen …</div>
+ </div>
+
+ <div class="top-story">
+ <a target="_blank" href="" onclick="location.href='xxURL1-2'">
+ <div class="top-stories-img" style="background-image: url('http://cdn.weeklystandard.biz/cache/r960-90b8d8d5cbcef212ecae2a5c455fed8f.jpg');" />
+ </div>
+ <div class="top-stories-hed">Bibi and Donald</div>
+ </a>
+ <div class="top-stories-desc">This week, Israel's prime minister will visit Washington and meet with our new president. They will have a complex agenda. Benjamin ...</div>
+ </div>
+
+ <div class="top-story">
+ <a target="_blank" href="" onclick="location.href='xxURL1-3'">
+ <div class="top-stories-img" style="background-image: url('https://static01.nyt.com/images/2017/02/13/multimedia/DavidOyelowo-UnitedKingdom/DavidOyelowo-UnitedKingdom-facebookJumbo.png');" />
+ </div>
+ <div class="top-stories-hed">David Oyelowo on How to Play a Real King</div>
+ </a>
+ <div class="top-stories-desc">He stars in “A United Kingdom,” about the Botswana leader who married a white woman and set off an international crisis.</div>
+ </div>
+
+ <div class="top-story">
+ <a target="_blank" href="" onclick="location.href='xxURL1-4'">
+ <div class="top-stories-img" style="background-image: url('http://a57.foxnews.com/images.foxnews.com/content/fox-news/us/2017/02/13/judge-orders-ohio-village-to-pay-back-3-million-to-lead-footed-drivers/_jcr_content/par/featured-media/media-0.img.jpg/0/0/1487019011476.jpg?ve=1');" />
+ </div>
+ <div class="top-stories-hed">Judge orders Ohio village to pay back $3 million to lead-footed drivers</div>
+ </a>
+ <div class="top-stories-desc">Speed cameras became a cash cow for the small village of New Miami, Ohio.</div>
+ </div>
+
+ </div>
+
+ <div id="middle-stories">
+
+ <a target="_blank" href="" onclick="location.href='xxURL2-1'">
+ <div class="middle-story">
+ <div class="middle-stories-img" style="background-image: url('http://www.theblaze.com/wp-content/uploads/2017/02/GettyImages-635148734-1280x720.jpg');">
+ </div>
+ <div class="middle-stories-hed">DHS says 75 percent of those detained in ICE raids last week were ‘criminal aliens’</div>
+ </div>
+ </a>
+
+ <a target="_blank" href="" onclick="location.href='xxURL2-2'">
+ <div class="middle-story">
+ <div class="middle-stories-img" style="background-image: url('http://a57.foxnews.com/media2.foxnews.com/BrightCove/694940094001/2017/02/12/0/0/694940094001_5320280093001_5320267547001-vs.jpg?ve=1');">
+ </div>
+ <div class="middle-stories-hed">Drama grips Trump inner circle, as president charges ahead on agenda</div>
+ </div>
+ </a>
+
+ <a target="_blank" href="" onclick="location.href='xxURL2-3'">
+ <div class="middle-story">
+ <div class="middle-stories-img" style="background-image: url('http://ichef.bbci.co.uk/news/1024/cpsprodpb/C9C5/production/_94635615_6c33162f-1c24-487d-8a51-bb7b13ec063f.jpg');">
+ </div>
+ <div class="middle-stories-hed">Ku Klux Klan killing: Frank Ancona's wife and stepson charged - BBC News</div>
+ </div>
+ </a>
+
+ <a target="_blank" href="" onclick="location.href='xxURL2-4'">
+ <div class="middle-story">
+ <div class="middle-stories-img" style="background-image: url('http://media1.s-nbcnews.com/j/newscms/2017_07/1900281/13217-oroville-dam-724a-rs_4a8b5ba9690488f11410f156833e1b70.nbcnews-fp-1200-800.jpg');">
+ </div>
+ <div class="middle-stories-hed">Nearly 190,000 ordered to evacuate in California dam spillway failure</div>
+ </div>
+ </a>
+
+ <a target="_blank" href="" >
+ <div class="middle-story">
+ <div class="middle-stories-img" style="background-image: url('http://cbsnews1.cbsistatic.com/hub/i/2017/02/13/4ad800d9-69ba-4102-a8ec-af12e8eb6adb/021317-news.jpg');">
+ </div>
+ <div class="middle-stories-hed">Jerry Sandusky's son, 41, arrested on child sex charges</div>
+ </div>
+ </a>
+
+ <a target="_blank" href="" >
+ <div class="middle-story">
+ <div class="middle-stories-img" style="background-image: url('https://static01.nyt.com/images/2017/02/14/us/14townhall1/14townhall1-facebookJumbo.jpg');">
+ </div>
+ <div class="middle-stories-hed">Angry Town Hall Meetings on Health Care Law, and Few Answers</div>
+ </div>
+ </a>
+
+
+ </div>
+
+ <div id="bottom-stories">
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-1xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-2xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-3xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-4xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-5xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-6xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-7xx</a>
+ </div>
+
+ <div class="bottom-story">
+ <a target="_blank" href="">xxTitle3-8xx</a>
+ </div>
+</div>
+
+</div>
+
+<div id="sources">
+ Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News
+</div>
+</body>
+</html>
diff --git a/html_template/template.html b/html_template/template.html index c0e0711..41eb86e 100644..100755 --- a/html_template/template.html +++ b/html_template/template.html @@ -16,12 +16,12 @@ <div id="page-container">
<div id="top-stories">
+ <div class="row">
- <div class="top-story">
- <a target="_blank" href="" onclick="window.open('xxURL1-1xx', '_blank')">
- <div class="top-stories-img" style="background-image: url('xxImg1-1xx');" />
- </div>
- <div class="top-stories-hed">xxTitle1-1xx</div>
+ <div class="top-story">
+ <a target="_blank" href="" onclick="window.open('xxURL1-1xx', '_blank')">
+ <div class="top-stories-img" style="background-image: url('xxImg1-1xx');" /></div>
+ <div class="top-stories-hed">xxTitle1-1xx</div>
</a>
<div class="top-stories-desc">xxDesc1-1xx</div>
</div>
@@ -35,6 +35,10 @@ <div class="top-stories-desc">xxDesc1-2xx</div>
</div>
+ </div>
+
+<div class="row">
+
<div class="top-story">
<a target="_blank" href="" onclick="window.open('xxURL1-3xx', '_blank')">
<div class="top-stories-img" style="background-image: url('xxImg1-3xx');" />
@@ -52,7 +56,9 @@ </a>
<div class="top-stories-desc">xxDesc1-4xx</div>
</div>
-
+
+ </div>
+
</div>
<div id="middle-stories">
diff --git a/html_template/unbiased.css b/html_template/unbiased.css index 126e194..c0bb121 100644..100755 --- a/html_template/unbiased.css +++ b/html_template/unbiased.css @@ -71,17 +71,22 @@ a:link, a:visited, a:hover, a:active { margin-bottom: 10px;
}
+.row{
+ display:flex;
+}
+
.top-story{
display:inline-block;
vertical-align:top;
text-align:left;
width:360px;
- height:352px;
+ height:auto;
overflow:hidden;
background:#fff;
margin:10px;
padding:10px;
border:2px solid #ccc;
+ flex:1;
}
@media only screen and (max-width:500px){
diff --git a/parser.py b/parser.py index a537d48..2c22a87 100644..100755 --- a/parser.py +++ b/parser.py @@ -1,805 +1,805 @@ -#!/usr/bin/env python3 - -from unbiasedObjects import * -from unbiasedFunctions import buildArticle -import os -import re - - -''' -Takes in a URL, downloads the file to a temp file, -reads the file into a string, and returns that string -''' -def urlToContent(url): - #download file - os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) - - #read file - f=open('scratch/temp1.html', 'r')#, encoding="utf8") - content=f.read() - f.close() - - return content - - -''' -Creates a new newsSource2 object. For each URL in h1-h3URLs, -calls the file scraper and appends the new Article object. -Returns a newsSource2 object -''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): - h1Arr=[] - h1Arr.append(buildArticle(h1URLs[0], name)) - - h2Arr=[] - for x in h2URLs: - a=buildArticle(x, name) - if a!=None: - h2Arr.append(a) - - h3Arr=[] - for x in h3URLs: - a=buildArticle(x, name) - if a!=None: - h3Arr.append(a) - - #BUILD THE NEWS SOURCE - newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) - - return newsSource - - -''' -Some sites will replicate URLs across the page. This function removes them. -Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; -if h2 exists in h1s, remove from h2s - -also check partial URLs (e.g. 
nytimes.com/story.html is the same as -nytimes.com/story.html?var=x -''' -def removeDuplicates(h1s, h2s, h3s): - #Assume h1s is one element, and keep it - - #remove h2 duplicates - removeArr=[] - for i in range(len(h2s)): - #check internally - for j in range(len(h2s)): - if i==j: - continue - else: - if h2s[i] in h2s[j]: - removeArr.append(h2s[j]) - #check against h1s - for k in range(len(h1s)): - if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): - removeArr.append(h2s[i]) - for x in removeArr: - h2s.remove(x) - - #remove h3 duplicates - removeArr=[] - for i in range(len(h3s)): - #check internally - for j in range(len(h3s)): - if i==j: - continue - else: - if h3s[i] in h3s[j]: - removeArr.append(h3s[j]) - #check against h1s and h2s - h1and2=h1s+h2s - for k in range(len(h1and2)): - if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): - removeArr.append(h3s[i]) - for x in removeArr: - h3s.remove(x) - - - return h1s, h2s, h3s - - - -def removalNotification(source, title, reason, value): - print('*************************') - print('\t\tSTORY REMOVED') - print('SOURCE: '+source) - print('TITLE: \t'+title) - print('REASON: '+reason) - print('VALUE: \t'+value) - print('*************************\n\n') - - -def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): - - arr=[source.h1Arr, source.h2Arr, source.h3Arr] - - if badTitleArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badTitleArr: - if item in hed.title: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Title', item) - - - if badDescArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badDescArr: - if item in hed.description: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - 
removalNotification(source.name, hed.title, 'Description', item) - - - if badAuthorArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badAuthorArr: - if item in hed.author: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Author', item) - - - if badImgArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badImgArr: - if item in hed.img: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Image', item) - - if badURLArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badURLArr: - if item in hed.url: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'URL', item) - - return source - - - - -def buildTheHill(): - url='http://thehill.com' - name='The Hill' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('<div class="headline-story-image">', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[url+h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('<div class="section-top-content">', 1)[1] - h2=h2.split('</ul>', 1)[0] - while '<div class="top-story-item' in h2 and len(h2s)<4: - h2=h2.split('<div class="top-story-item', 1)[1] - x=h2.split('<a href="', 1)[1] - x=x.split('"', 1)[0] - h2s.append(url+x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('<div class="section-top-content">', 1)[1] - h3=h3.split('</ul>', 1)[0] - while '<div class="top-story-item small' in h3: - h3=h3.split('<div class="top-story-item small', 1)[1] - x=h3.split('<a href="', 1)[1] - 
x=x.split('"', 1)[0] - h3s.append(url+x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - hil=buildNewsSource2(name, url, h1s, h2s, h3s) - #hil=removeBadStories(gdn, None, None, None, None) - - return hil - - - - - -def buildGuardian(): - url='http://www.theguardian.com/us-news' - name='The Guardian' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('<h1', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - #only the h1 and the two h2s have this, so split on it and grab - #the second two - h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:] - for x in h2: - x=x.split('<h2 class="fc-item__title"><a href="', 1)[1] - x=x.split('"', 1)[0] - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('<div class="fc-slice-wrapper">', 1)[1] - h3=h3.split('<div class="js-show-more-placeholder">', 1)[0] - #this story section goes on forever; just grab the first 5 - while '<h2 class="fc-item__title"><a href="' in h3: - h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1] - x=h3.split('"', 1)[0] - h3s.append(x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - - gdn=buildNewsSource2(name, url, h1s, h2s, h3s) - gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None) - - return gdn - - -''' -Function to fix the oddly short og:descriptions provided -in The Blaze articles by grabbing the first portion of the story instead -''' -def blazeFixDesc(articleArr): - TAG_RE = re.compile(r'<[^>]+>') - for i in range(len(articleArr)): - desc=urlToContent(articleArr[i].url) - desc=desc.split('<div class="entry-content article-styles">', 1)[1] - desc=desc.split('<p>', 1)[1] - desc=TAG_RE.sub('', desc) - desc=desc.replace('\n', ' ') - desc=desc[:144] - articleArr[i].description=desc - - return articleArr - - - -def buildBlaze(): - 
url='http://theblaze.com' - name='The Blaze' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('<!-- home -->', 1)[1] - h1=h1.split('<!-- loop-home -->', 1)[0] - h1=h1.split('<a class="gallery-link" href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[url+h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('<!-- home -->', 1)[1] - h2=h2.split('<!-- loop-home -->', 1)[0] - while '</figure>\n\n<figure class="gallery-item">' in h2: - h2=h2.split('</figure>\n\n<figure class="gallery-item">', 1)[1] - h2=h2.split('href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(url+x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('<!-- loop-home -->', 1)[1] - #this story section goes on forever; just grab the first 5 - while len(h3s)<5: - h3=h3.split('<a class="feed-link" href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(url+x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - - - blz=buildNewsSource2(name, url, h1s, h2s, h3s) - blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka'], None) - - #The Blaze has dumb, short description fields, so we need to grab - #the first x characters of actual article text instead - blz.h1Arr=blazeFixDesc(blz.h1Arr) - blz.h2Arr=blazeFixDesc(blz.h2Arr) - blz.h3Arr=blazeFixDesc(blz.h3Arr) - - return blz - - - -def buildCBS(): - url='http://cbsnews.com' - name='CBS News' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - if '<h1 class="title">' in content: - h1=h1.split('<h1 class="title">', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[url+h1] - else: - #for cases where they lead with a video, pull the first h2 as h1 - h1=h1.split('Big News Area Side Assets', 1)[1] - h1=h1.split('</ul></div>', 1)[0] - h1=h1.split('<li data-tb-region-item>', 1)[1] - h1=h1.split('<a href="', 1)[1] - x=h1.split('"', 
1)[0] - h1s=[url+x] - - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('Big News Area Side Assets', 1)[1] - h2=h2.split('</ul></div>', 1)[0] - while '<li data-tb-region-item>' in h2: - h2=h2.split('<li data-tb-region-item>', 1)[1] - h2=h2.split('<a href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(url+x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('Latest News', 1)[1] - #this story section goes on forever; just grab the first 5 - while len(h3s)<5: - h3=h3.split('<li class="item-full-lead"', 1)[1] - h3=h3.split('<a href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(url+x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - cbs=buildNewsSource2(name, url, h1s, h2s, h3s) - - return cbs - - - - - -def buildNBC(): - url='http://nbcnews.com' - name='NBC News' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('top-stories-section', 1)[1] - h1=h1.split('panel_hero', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - if '.com' not in h1: - h1=url+h1 - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('ad-content ad-xs mobilebox1', 1)[1] - h2=h2.split('taboola-native-top-stories-thumbnail', 1)[0] - while '<div class="story-link' in h2: - h2=h2.split('<div class="story-link', 1)[1] - h2=h2.split('<a href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - if '.com' not in x: - x=url+x - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('js-more-topstories', 1)[1] - h3=h3.split('<div class="panel-section', 1)[0] - while '<div class="story-link' in h3: - h3=h3.split('<div class="story-link', 1)[1] - h3=h3.split('<a href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - if '.com' not in x: - x=url+x - h3s.append(x) - - #adjust for today.com urls - for arr in [h1s, h2s, h3s]: - for i in range(len(arr)): - if 'today.com' in arr[i]: - arr[i]=arr[i].split('.com', 1)[1] - - h1s, 
h2s, h3s = removeDuplicates(h1s, h2s, h3s) - nbc=buildNewsSource2(name, url, h1s, h2s, h3s) - - return nbc - - - - -def buildBBC(): - url='http://www.bbc.com/news/world/us_and_canada' - name='BBC US & Canada' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('buzzard-item', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=['http://www.bbc.com'+h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('<div class="pigeon">', 1)[1] - h2=h2.split('<div id=', 1)[0] - while 'top_stories#' in h2: - h2=h2.split('top_stories#', 1)[1] - h2=h2.split('<a href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append('http://www.bbc.com'+x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('<div class="macaw">', 1)[1] - h3=h3.split('Watch/Listen', 1)[0] - while '<div class="macaw-item' in h3: - h3=h3.split('<div class="macaw-item', 1)[1] - h3=h3.split('<a href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append('http://www.bbc.com'+x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - bbc=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE ' - BBC News' from headlines - for i in range(len(bbc.h1Arr)): - if ' - BBC News' in bbc.h1Arr[i].title: - bbc.h1Arr[i].title=bbc.h1Arr[i].title.split(' - BBC News', 1)[0] - for i in range(len(bbc.h2Arr)): - if ' - BBC News' in bbc.h2Arr[i].title: - bbc.h2Arr[i].title=bbc.h2Arr[i].title.split(' - BBC News', 1)[0] - for i in range(len(bbc.h3Arr)): - if ' - BBC News' in bbc.h3Arr[i].title: - bbc.h3Arr[i].title=bbc.h3Arr[i].title.split(' - BBC News', 1)[0] - - return bbc - - - -def buildWeeklyStandard(): - url='http://www.weeklystandard.com' - name='Weekly Standard' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('<div id="region_1"', 1)[1] - h1=h1.split('<div id="region_2"', 1)[0] - h1=h1.split('<div class="lead-photo">', 1)[1] - h1=h1.split('href="', 
1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('<div class="widget lead-story layout-3col-feature" data-count="2">', 1)[1] - h2=h2.split('<div id="region_2"', 1)[0] - while '<div class="lead-photo">' in h2: - h2=h2.split('<div class="lead-photo">', 1)[1] - h2=h2.split('href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('Today\'s Standard', 1)[1] - h3=h3.split('<div id="region_3"', 1)[0] - while '<div class="lead-photo">' in h3: - h3=h3.split('<div class="lead-photo">', 1)[1] - h3=h3.split('href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - #Need to add URL prefix to all URLs - for i in range(len(h1s)): - h1s[i]=url+h1s[i] - for i in range(len(h2s)): - h2s[i]=url+h2s[i] - for i in range(len(h3s)): - h3s[i]=url+h3s[i] - - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=None - ## if flagged again, remove Micah Mattix - badDescArr=['Matt Labash'] - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. 
Lenzner', 'MARK HEMINGWAY'] - badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] - wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) - - return wkl - - - - -def buildNPR(): - url='http://www.npr.org/sections/news/' - name='NPR' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('<a id="mainContent">', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('<article class="item has-image">', 1)[1] - h2=h2.split('<!-- END CLASS=\'FEATURED-3-UP\' -->', 1)[0] - while '<article class="item has-image">' in h2: - h2=h2.split('<article class="item has-image">', 1)[1] - h2=h2.split('<a href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('<div id="overflow" class="list-overflow"', 1)[1] - h3=h3.split('<!-- END ID="OVERFLOW" CLASS="LIST-OVERFLOW"', 1)[0] - while '<h2 class="title"><a href="' in h3: - h3=h3.split('<h2 class="title"><a href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - - npr=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=None - badDescArr=None - badAuthorArr=None - badImgArr=None - #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr) - - return npr - - - - -def buildFoxNews(): - url='http://foxnews.com' - name='Fox News' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('<h1><a href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('<div class="top-stories">', 1)[1] - h2=h2.split('<section id="latest"', 1)[0] - while '<li data-vr-contentbox=""><a href="' in h2: - h2=h2.split('<li data-vr-contentbox=""><a href="', 1)[1] - 
x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('div id="big-top"', 1)[1] - h3=h3.split('<div class="top-stories">', 1)[0] - while '<a href="' in h3: - h3=h3.split('<a href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s) - fox=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=['O'Reilly'] - badDescArr=None - badAuthorArr=['Bill O\'Reilly', 'Sean Hannity'] - badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg'] - badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com'] - fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr) - - return fox - - - -def buildNYT(): - url='http://www.nytimes.com' - name='New York Times' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - #this will likely need if/else logic - h1=content - - if 'story theme-summary banner' in h1: - #This is with a large headline over a and b columns - h1=h1.split('story theme-summary banner', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] - else: - #otherwise, pull the first story from the A column - h1=h1.split('<div class="a-column column">', 1)[1] - h1=h1.split('<a href="', 1)[1].split('"', 1)[0] - h1s=[h1] - - - #GET SECONDARY HEADLINES - #This comes from the a column or b column, above the break - h2=content - h2s=[] - #A column - h2=h2.split('<div class="a-column column">', 1)[1] - h2=h2.split('<!-- close a-column -->', 1)[0] - #remove "collection" sets - while '<div class="collection headlines">' in h2: - arr=h2.split('<div class="collection headlines">', 1) - h2=arr[0]+arr[1].split('</ul>', 1)[1] - #Grab the remaining URLs - while '<a href="' in h2: - h2=h2.split('<a href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #B column - h2=content - h2=h2.split('<div class="b-column 
column">', 1)[1] - h2=h2.split('<!-- close b-column -->', 1)[0] - #remove "collection" sets - while '<div class="collection headlines">' in h2: - arr=h2.split('<div class="collection headlines">', 1) - h2=arr[0]+arr[1].split('</ul>', 1)[1] - #Grab the remaining URLs - while '<a href="' in h2: - h2=h2.split('<a href="', 1)[1] - x=h2.split('"', 1)[0] - if (h1 not in x) and (x not in h2s): - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('<!-- close lede-package-region -->', 1)[1] - h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0] - #remove "collection" sets - while '<div class="collection headlines">' in h2: - arr=h3.split('<div class="collection headlines">', 1) - h3=arr[0]+arr[1].split('</ul>', 1)[1] - - #Grab the remaining URLs - while '<a href="' in h3: - h3=h3.split('<a href="', 1)[1] - x=h3.split('"', 1)[0] - if (h1 not in x) and (x not in h3s): - h3s.append(x) - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - nyt=buildNewsSource2(name, url, h1s, h2s, h3s) - - return nyt - - - - -''' -NYT -EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS - -<div class="span-ab-layout layout"> - - <div class="ab-column column"> - - <section id="top-news" class="top-news"> - <h2 class="section-heading visually-hidden">Top News</h2> - - <div class="above-banner-region region"> - - <div class="collection"> - <div class="hpHeader" id="top-megapackage-kicker"> - <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6> -</div> - -</div> - - </div><!-- close above-banner-region --> - - <div class="span-ab-top-region region"> - - <div class="collection"> - <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner"> - <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. 
Questioned Flynn About Russia Call</a></h1> -</article> -</div> - - </div><!-- close span-ab-top-region --> -''' +#!/usr/bin/env python3
+
import os
import re
import subprocess

from unbiasedObjects import *
from unbiasedFunctions import buildArticle
+
+
+'''
+Takes in a URL, downloads the file to a temp file,
+reads the file into a string, and returns that string
+'''
+def urlToContent(url):
+ #download file
+ os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+
+ #read file
+ f=open('scratch/temp1.html', 'r')#, encoding="utf8")
+ content=f.read()
+ f.close()
+
+ return content
+
+
+'''
+Creates a new newsSource2 object. For each URL in h1-h3URLs,
+calls the file scraper and appends the new Article object.
+Returns a newsSource2 object
+'''
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
+ h1Arr=[]
+ h1Arr.append(buildArticle(h1URLs[0], name))
+
+ h2Arr=[]
+ for x in h2URLs:
+ a=buildArticle(x, name)
+ if a!=None:
+ h2Arr.append(a)
+
+ h3Arr=[]
+ for x in h3URLs:
+ a=buildArticle(x, name)
+ if a!=None:
+ h3Arr.append(a)
+
+ #BUILD THE NEWS SOURCE
+ newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
+
+ return newsSource
+
+
+'''
+Some sites will replicate URLs across the page. This function removes them.
+Check hierarchically: if h3 exists in h1s or h2s, remove from h3s;
+if h2 exists in h1s, remove from h2s
+
+also check partial URLs (e.g. nytimes.com/story.html is the same as
+nytimes.com/story.html?var=x
+'''
+def removeDuplicates(h1s, h2s, h3s):
+ #Assume h1s is one element, and keep it
+
+ #remove h2 duplicates
+ removeArr=[]
+ for i in range(len(h2s)):
+ #check internally
+ for j in range(len(h2s)):
+ if i==j:
+ continue
+ else:
+ if h2s[i] in h2s[j]:
+ removeArr.append(h2s[j])
+ #check against h1s
+ for k in range(len(h1s)):
+ if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]):
+ removeArr.append(h2s[i])
+ for x in removeArr:
+ h2s.remove(x)
+
+ #remove h3 duplicates
+ removeArr=[]
+ for i in range(len(h3s)):
+ #check internally
+ for j in range(len(h3s)):
+ if i==j:
+ continue
+ else:
+ if h3s[i] in h3s[j]:
+ removeArr.append(h3s[j])
+ #check against h1s and h2s
+ h1and2=h1s+h2s
+ for k in range(len(h1and2)):
+ if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]):
+ removeArr.append(h3s[i])
+ for x in removeArr:
+ h3s.remove(x)
+
+
+ return h1s, h2s, h3s
+
+
+
+def removalNotification(source, title, reason, value):
+ print('*************************')
+ print('\t\tSTORY REMOVED')
+ print('SOURCE: '+source)
+ print('TITLE: \t'+title)
+ print('REASON: '+reason)
+ print('VALUE: \t'+value)
+ print('*************************\n\n')
+
+
+def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
+
+ arr=[source.h1Arr, source.h2Arr, source.h3Arr]
+
+ if badTitleArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badTitleArr:
+ if item in hed.title:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ removalNotification(source.name, hed.title, 'Title', item)
+
+
+ if badDescArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badDescArr:
+ if item in hed.description:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ removalNotification(source.name, hed.title, 'Description', item)
+
+
+ if badAuthorArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badAuthorArr:
+ if item in hed.author:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ removalNotification(source.name, hed.title, 'Author', item)
+
+
+ if badImgArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badImgArr:
+ if item in hed.img:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ removalNotification(source.name, hed.title, 'Image', item)
+
+ if badURLArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badURLArr:
+ if item in hed.url:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ removalNotification(source.name, hed.title, 'URL', item)
+
+ return source
+
+
+
+
+def buildTheHill():
+ url='http://thehill.com'
+ name='The Hill'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<div class="headline-story-image">', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[url+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<div class="section-top-content">', 1)[1]
+ h2=h2.split('</ul>', 1)[0]
+ while '<div class="top-story-item' in h2 and len(h2s)<4:
+ h2=h2.split('<div class="top-story-item', 1)[1]
+ x=h2.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div class="section-top-content">', 1)[1]
+ h3=h3.split('</ul>', 1)[0]
+ while '<div class="top-story-item small' in h3:
+ h3=h3.split('<div class="top-story-item small', 1)[1]
+ x=h3.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h3s.append(url+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ hil=buildNewsSource2(name, url, h1s, h2s, h3s)
+ #hil=removeBadStories(gdn, None, None, None, None)
+
+ return hil
+
+
+
+
+
+def buildGuardian():
+ url='http://www.theguardian.com/us-news'
+ name='The Guardian'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<h1', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ #only the h1 and the two h2s have this, so split on it and grab
+ #the second two
+ h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:]
+ for x in h2:
+ x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
+ h3=h3.split('<div class="js-show-more-placeholder">', 1)[0]
+ #this story section goes on forever; just grab the first 5
+ while '<h2 class="fc-item__title"><a href="' in h3:
+ h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ h3s.append(x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+ gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
+ gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
+
+ return gdn
+
+
+'''
+Function to fix the oddly short og:descriptions provided
+in The Blaze articles by grabbing the first portion of the story instead
+'''
+def blazeFixDesc(articleArr):
+ TAG_RE = re.compile(r'<[^>]+>')
+ for i in range(len(articleArr)):
+ desc=urlToContent(articleArr[i].url)
+ desc=desc.split('<div class="entry-content article-styles">', 1)[1]
+ desc=desc.split('<p>', 1)[1]
+ desc=TAG_RE.sub('', desc)
+ desc=desc.replace('\n', ' ')
+ desc=desc[:144]
+ articleArr[i].description=desc
+
+ return articleArr
+
+
+
+def buildBlaze():
+ url='http://theblaze.com'
+ name='The Blaze'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<!-- home -->', 1)[1]
+ h1=h1.split('<!-- loop-home -->', 1)[0]
+ h1=h1.split('<a class="gallery-link" href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[url+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<!-- home -->', 1)[1]
+ h2=h2.split('<!-- loop-home -->', 1)[0]
+ while '</figure>\n\n<figure class="gallery-item">' in h2:
+ h2=h2.split('</figure>\n\n<figure class="gallery-item">', 1)[1]
+ h2=h2.split('href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<!-- loop-home -->', 1)[1]
+ #this story section goes on forever; just grab the first 5
+ while len(h3s)<5:
+ h3=h3.split('<a class="feed-link" href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(url+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+
+ blz=buildNewsSource2(name, url, h1s, h2s, h3s)
+ blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka'], None)
+
+ #The Blaze has dumb, short description fields, so we need to grab
+ #the first x characters of actual article text instead
+ blz.h1Arr=blazeFixDesc(blz.h1Arr)
+ blz.h2Arr=blazeFixDesc(blz.h2Arr)
+ blz.h3Arr=blazeFixDesc(blz.h3Arr)
+
+ return blz
+
+
+
+def buildCBS():
+ url='http://cbsnews.com'
+ name='CBS News'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ if '<h1 class="title">' in content:
+ h1=h1.split('<h1 class="title">', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[url+h1]
+ else:
+ #for cases where they lead with a video, pull the first h2 as h1
+ h1=h1.split('Big News Area Side Assets', 1)[1]
+ h1=h1.split('</ul></div>', 1)[0]
+ h1=h1.split('<li data-tb-region-item>', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ x=h1.split('"', 1)[0]
+ h1s=[url+x]
+
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('Big News Area Side Assets', 1)[1]
+ h2=h2.split('</ul></div>', 1)[0]
+ while '<li data-tb-region-item>' in h2:
+ h2=h2.split('<li data-tb-region-item>', 1)[1]
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('Latest News', 1)[1]
+ #this story section goes on forever; just grab the first 5
+ while len(h3s)<5:
+ h3=h3.split('<li class="item-full-lead"', 1)[1]
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(url+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ return cbs
+
+
+
+
+
+def buildNBC():
+ url='http://nbcnews.com'
+ name='NBC News'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('top-stories-section', 1)[1]
+ h1=h1.split('panel_hero', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ if '.com' not in h1:
+ h1=url+h1
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('ad-content ad-xs mobilebox1', 1)[1]
+ h2=h2.split('taboola-native-top-stories-thumbnail', 1)[0]
+ while '<div class="story-link' in h2:
+ h2=h2.split('<div class="story-link', 1)[1]
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ if '.com' not in x:
+ x=url+x
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('js-more-topstories', 1)[1]
+ h3=h3.split('<div class="panel-section', 1)[0]
+ while '<div class="story-link' in h3:
+ h3=h3.split('<div class="story-link', 1)[1]
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ if '.com' not in x:
+ x=url+x
+ h3s.append(x)
+
+ #adjust for today.com urls
+ for arr in [h1s, h2s, h3s]:
+ for i in range(len(arr)):
+ if 'today.com' in arr[i]:
+ arr[i]=arr[i].split('.com', 1)[1]
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ return nbc
+
+
+
+
+def buildBBC():
+ url='http://www.bbc.com/news/world/us_and_canada'
+ name='BBC US & Canada'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('buzzard-item', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=['http://www.bbc.com'+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<div class="pigeon">', 1)[1]
+ h2=h2.split('<div id=', 1)[0]
+ while 'top_stories#' in h2:
+ h2=h2.split('top_stories#', 1)[1]
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append('http://www.bbc.com'+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div class="macaw">', 1)[1]
+ h3=h3.split('Watch/Listen', 1)[0]
+ while '<div class="macaw-item' in h3:
+ h3=h3.split('<div class="macaw-item', 1)[1]
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append('http://www.bbc.com'+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ #REMOVE ' - BBC News' from headlines
+ for i in range(len(bbc.h1Arr)):
+ if ' - BBC News' in bbc.h1Arr[i].title:
+ bbc.h1Arr[i].title=bbc.h1Arr[i].title.split(' - BBC News', 1)[0]
+ for i in range(len(bbc.h2Arr)):
+ if ' - BBC News' in bbc.h2Arr[i].title:
+ bbc.h2Arr[i].title=bbc.h2Arr[i].title.split(' - BBC News', 1)[0]
+ for i in range(len(bbc.h3Arr)):
+ if ' - BBC News' in bbc.h3Arr[i].title:
+ bbc.h3Arr[i].title=bbc.h3Arr[i].title.split(' - BBC News', 1)[0]
+
+ return bbc
+
+
+
+def buildWeeklyStandard():
+ url='http://www.weeklystandard.com'
+ name='Weekly Standard'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<div id="region_1"', 1)[1]
+ h1=h1.split('<div id="region_2"', 1)[0]
+ h1=h1.split('<div class="lead-photo">', 1)[1]
+ h1=h1.split('href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<div class="widget lead-story layout-3col-feature" data-count="2">', 1)[1]
+ h2=h2.split('<div id="region_2"', 1)[0]
+ while '<div class="lead-photo">' in h2:
+ h2=h2.split('<div class="lead-photo">', 1)[1]
+ h2=h2.split('href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('Today\'s Standard', 1)[1]
+ h3=h3.split('<div id="region_3"', 1)[0]
+ while '<div class="lead-photo">' in h3:
+ h3=h3.split('<div class="lead-photo">', 1)[1]
+ h3=h3.split('href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(x)
+
+ #Need to add URL prefix to all URLs
+ for i in range(len(h1s)):
+ h1s[i]=url+h1s[i]
+ for i in range(len(h2s)):
+ h2s[i]=url+h2s[i]
+ for i in range(len(h3s)):
+ h3s[i]=url+h3s[i]
+
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ #REMOVE BAD STORIES
+ badTitleArr=None
+ ## if flagged again, remove Micah Mattix
+ badDescArr=['Matt Labash']
+ badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY']
+ badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
+ wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
+ return wkl
+
+
+
+
+def buildNPR():
+ url='http://www.npr.org/sections/news/'
+ name='NPR'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<a id="mainContent">', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<article class="item has-image">', 1)[1]
+ h2=h2.split('<!-- END CLASS=\'FEATURED-3-UP\' -->', 1)[0]
+ while '<article class="item has-image">' in h2:
+ h2=h2.split('<article class="item has-image">', 1)[1]
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div id="overflow" class="list-overflow"', 1)[1]
+ h3=h3.split('<!-- END ID="OVERFLOW" CLASS="LIST-OVERFLOW"', 1)[0]
+ while '<h2 class="title"><a href="' in h3:
+ h3=h3.split('<h2 class="title"><a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+ npr=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ #REMOVE BAD STORIES
+ badTitleArr=None
+ badDescArr=None
+ badAuthorArr=None
+ badImgArr=None
+ #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
+ return npr
+
+
+
+
+def buildFoxNews():
+ url='http://foxnews.com'
+ name='Fox News'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<h1><a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<div class="top-stories">', 1)[1]
+ h2=h2.split('<section id="latest"', 1)[0]
+ while '<li data-vr-contentbox=""><a href="' in h2:
+ h2=h2.split('<li data-vr-contentbox=""><a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('div id="big-top"', 1)[1]
+ h3=h3.split('<div class="top-stories">', 1)[0]
+ while '<a href="' in h3:
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(x)
+
+ h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+ fox=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ #REMOVE BAD STORIES
+	badTitleArr=['O\'Reilly']
+ badDescArr=None
+ badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
+ badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
+ badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']
+ fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+ return fox
+
+
+
+def buildNYT():
+ url='http://www.nytimes.com'
+ name='New York Times'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ #this will likely need if/else logic
+ h1=content
+
+ if 'story theme-summary banner' in h1:
+ #This is with a large headline over a and b columns
+ h1=h1.split('story theme-summary banner', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ else:
+ #otherwise, pull the first story from the A column
+ h1=h1.split('<div class="a-column column">', 1)[1]
+ h1=h1.split('<a href="', 1)[1].split('"', 1)[0]
+ h1s=[h1]
+
+
+ #GET SECONDARY HEADLINES
+ #This comes from the a column or b column, above the break
+ h2=content
+ h2s=[]
+ #A column
+ h2=h2.split('<div class="a-column column">', 1)[1]
+ h2=h2.split('<!-- close a-column -->', 1)[0]
+ #remove "collection" sets
+ while '<div class="collection headlines">' in h2:
+ arr=h2.split('<div class="collection headlines">', 1)
+ h2=arr[0]+arr[1].split('</ul>', 1)[1]
+ #Grab the remaining URLs
+ while '<a href="' in h2:
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(x)
+
+ #B column
+ h2=content
+ h2=h2.split('<div class="b-column column">', 1)[1]
+ h2=h2.split('<!-- close b-column -->', 1)[0]
+ #remove "collection" sets
+ while '<div class="collection headlines">' in h2:
+ arr=h2.split('<div class="collection headlines">', 1)
+ h2=arr[0]+arr[1].split('</ul>', 1)[1]
+ #Grab the remaining URLs
+ while '<a href="' in h2:
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if (h1 not in x) and (x not in h2s):
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<!-- close lede-package-region -->', 1)[1]
+ h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0]
+ #remove "collection" sets
+	while '<div class="collection headlines">' in h3:
+ arr=h3.split('<div class="collection headlines">', 1)
+ h3=arr[0]+arr[1].split('</ul>', 1)[1]
+
+ #Grab the remaining URLs
+ while '<a href="' in h3:
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if (h1 not in x) and (x not in h3s):
+ h3s.append(x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ return nyt
+
+
+
+
+'''
+NYT
+EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
+
+<div class="span-ab-layout layout">
+
+ <div class="ab-column column">
+
+ <section id="top-news" class="top-news">
+ <h2 class="section-heading visually-hidden">Top News</h2>
+
+ <div class="above-banner-region region">
+
+ <div class="collection">
+ <div class="hpHeader" id="top-megapackage-kicker">
+ <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6>
+</div>
+
+</div>
+
+ </div><!-- close above-banner-region -->
+
+ <div class="span-ab-top-region region">
+
+ <div class="collection">
+ <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner">
+ <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1>
+</article>
+</div>
+
+ </div><!-- close span-ab-top-region -->
+'''
|