1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
#!/usr/bin/env python3
from unbiasedObjects import *
from unbiasedFunctions import *
import time
def main():
while True:
print('-----------------------')
run()
print('-----------------------')
time.sleep(120)
def run():
sourceList=[]
sourceList.append(NewsSource('BBC US',
'http://www.bbc.com/news/world/us_and_canada',
['buzzard-item', '<a href="'],
['top_stories#', '<a href="'],
[],
None, None,
'<div class="pigeon">','<div id=',
None, None,
'http://www.bbc.com'))
sourceList.append(NewsSource('NBC News',
'http://nbcnews.com',
['top-stories-section', 'panel_hero', '<a href="'],
['<div class="story-link', '<a href="'],
[],
None, None,
'ad-content ad-xs mobilebox1', 'taboola-native-top-stories-thumbnail',
None, None))
sourceList.append(NewsSource('CBS News',
'http://cbsnews.com',
['<a href="'],
['<li data-tb-region-item>', '<a href="'],
[],
'Big News Area Side Assets', '</a>'
'Big News Area Side Assets', '</ul></div>',
None, None))
sourceList.append(NewsSource('The Blaze',
'http://theblaze.com',
['<a class="gallery-link" href="'],
['</figure>\n\n<figure class="gallery-item">', 'href="'],
[],
'<!-- home -->', '<!-- loop-home -->',
'<!-- home -->', '<!-- loop-home -->',
None, None))
sourceList.append(NewsSource('Weekly Standard',
'http://www.weeklystandard.com/',
['<div class="lead-photo">', 'href="'],
['<div class="lead-photo">', 'href="'],
[],
'<div id="region_1"', '<div id="region_2"',
'<div class="widget lead-story layout-3col-feature" data-count="2">', '<div id="region_2"',
None, None))
sourceList.append(NewsSource('New York Times',
'http://nytimes.com',
['<a href="'],
['<article class="story theme-summary"', '<a href="'],
['<hr class="single-rule"', 'article class="story theme-summary', 'h2 class="story-heading"><a href="'],
'<div class="b-column column">', '<!-- close photo-spot-region -->',
'section id="top-news" class="top-news"', '</div><!-- close a-column -->',
'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
sourceList.append(NewsSource('Fox News',
'http://foxnews.com',
['<h1><a href="'],
['<li data-vr-contentbox=""><a href="'],
[],
None, None,
'<div class="top-stories">', '<section id="latest"',
None, None))
#scrape all urls and build data structure
newsSourceArr=buildNewsSourceArr(sourceList)
#build the output file HTML
outputHTML=buildOutput(newsSourceArr)
#print the output file HTML
printOutputHTML(outputHTML, '/var/www/html/index.html')
if __name__=="__main__":
main()
|