# reuters_scraper.py
# Dependencies
# built-ins
import os # making and reading directories
import time # for wait functions
import multiprocessing # get how many CPUs are in your PC
from datetime import datetime # for getting today's date
# 3rd-party
import numpy as np # for NaN placeholders when an article fails to parse
import pandas as pd # for data processing and .csv I/O
from tqdm import tqdm # for progress bars
from newspaper import Article # for parsing Reuters articles
from selenium import webdriver # for web scraping
from joblib import Parallel, delayed # for parallel processing
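# NOTE: the code below references a module-level flag named massive_scrape_mode
# that is never defined in this file; the default here is an assumption so the
# module can be imported and run on its own. Setting it to True makes
# get_data_for_stock() write per-ticker CSVs into reuters_data/ and track
# progress via marker folders in processed/.
massive_scrape_mode = False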
# Functions
def get_data_for_stock(stock, verbose = False):
# get_data_for_stock()
# Takes input "stock" and writes a {stock}.csv file to the reuters_data directory.
# A folder called "processed" also gets a subfolder named after each stock that has
# already been handled, which makes it easy to simply re-run the script if it is
# stopped (e.g. your PC crashes, you have to kill the script) without redoing work.
# Input: stock (str) - ticker symbol of a designated stock
# Output: None when massive_scrape_mode is True; otherwise a DataFrame of parsed article data
# Set the Firefox webdriver to run headless in the background
fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.set_headless()
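# (Note: set_headless() and find_element_by_xpath() belong to the older
# Selenium 3 API this script was written against; recent Selenium 4 releases
# drop them in favour of options.add_argument('-headless') and
# driver.find_element(By.XPATH, ...).)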
if massive_scrape_mode:
if stock.replace('.', '_') not in os.listdir('processed'):
os.mkdir('processed/{}'.format(stock.replace('.', '_'))) # Create a folder named after {stock}
# to mark that this stock has already been processed
driver = webdriver.Firefox(options = fireFoxOptions) # Initialize the webdriver instance in the background
try:
# Search Reuters for {stock}
driver.get('https://www.reuters.com/search/news?blob={}'.format(stock))
time.sleep(2) # Wait for the page to load
# Reuters should return the company in its search results if it has written
# articles on it. If it hasn't, an error will be thrown and no files will be written.
# This line gets the top search result's element, which contains
# the stock's name and the URL of Reuters's page on the stock
text = driver.find_element_by_xpath('/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a').text
# {condition} will determine if the stock queried is actually
# the stock that we're trying to get articles on
condition = False
# Reuters will format the element's text in 2 ways:
# {company name} ({ticker}.{some additional text})
# e.g.
# Apple Inc (AAPL.OQ)
# or
# {company name} ({ticker})
# e.g.
# Alcoa Corp (AA)
# The following lines of code will filter down the element's text
# to just get {ticker}
# e.g.
# Apple Inc (AAPL.OQ) --> AAPL
# Alcoa Corp (AA) --> AA
if '.' in text: # Check if a period is in the text
# Check if {ticker}.upper() == {stock}.upper()
if text[text.find('(') + 1:text.find('.')].upper() == stock.upper():
condition = True # {condition} = True means that the queried stock is a match
else: # If there is no period
# Check if {ticker}.upper() == {stock}.upper()
if text[text.find('(') + 1: text.find(')')].upper() == stock.upper():
condition = True # {condition} = True means that the queried stock is a match
if condition: # If {stock} has been found in Reuters, continue
if verbose:
print('Stock was found on Reuters.')
# Click the element's link, going to Reuters's page on the stock
driver.find_element_by_xpath('/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a').click()
time.sleep(0.5) # Let the stock's Reuters page load
# Go to the "News" section of the stock's Reuters page
driver.find_element_by_xpath('/html/body/div[1]/div/div[3]/div/div/nav/div[1]/div/div/ul/li[2]/button').click()
time.sleep(5)
# The next segment will scroll down to the bottom of the "News"
# page of the stock's Reuters page
SCROLL_PAUSE_TIME = 0.5 # This is how much time it waits before scrolling to
# the bottom of the page again
# Get the last height of the page
last_height = driver.execute_script("return document.body.scrollHeight")
it_num = 0
if verbose:
print('Scrolling to the bottom of the news page...')
while True:
if verbose:
if it_num % 10 == 0:
print('{} - Scroll: Iteration #{}'.format(stock, it_num))
it_num += 1
# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait for more content to load
time.sleep(SCROLL_PAUSE_TIME)
# Get the current height of the page
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
# If the current height of the page is the same as it was before,
# break the script because there is no more content to load.
break
last_height = new_height # Update the last recorded height of the page
if verbose:
print('Scroll completed.')
i = 1 # Article index number starts at one in the HTML
datas = [] # Put all of the data in here
tol = 0 # tol is the number of times an error has been thrown (for this stock's news page).
# Once it reaches 2, it's assumed that all articles for this stock
# have been collected.
# The amount of articles on the page is unknown, so a while loop is
# used to iterate until no more new articles are found
if verbose:
print('Scraping the site...')
it_num = 0
while True:
try:
if verbose:
if it_num % 10 == 0:
print('{} - Scrape: Iteration #{}'.format(stock, it_num))
it_num += 1
# This is the xpath for the title of the article
xpath = '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]/div/a'.format(i)
i += 1
# An error will be thrown if there are no more articles left because
# the driver won't be able to find the non-existent next article.
header = driver.find_element_by_xpath(xpath).text # Get the article's header text
link = driver.find_element_by_xpath(xpath).get_attribute('href') # Get the link of the article
# The links themselves are processed separately (see convert_link_to_data below)
# so fewer threads have to be devoted to this loop
datas.append([header, link])
except Exception as e:
tol += 1 # Increase the error tally by 1
time.sleep(30)
# print(datas)
if tol >= 2: # Only break once 2 errors have been thrown, to confirm
# that all of the articles have actually been found
break
datas = pd.DataFrame(datas, columns = ['text', 'link']) # Compile the list of headers and links into a pandas DataFrame
if verbose:
print('Scraping for further information...')
links_data = Parallel(n_jobs = 1, backend = 'threading', verbose = 0)(delayed(convert_link_to_data)(link) for link in datas['link'].values.tolist())
links_data = pd.DataFrame(links_data, columns = ['author', 'publish_date', 'body_text'])
if massive_scrape_mode:
links_data.to_csv('reuters_data/{}.csv'.format(stock.replace('.', '_'))) # Export the data to the reuters_data folder under the name {stock}.csv
else:
if verbose:
print('Stock not found on Reuters.')
# Stop the driver
driver.quit()
try:
if not massive_scrape_mode:
return links_data
except NameError: # links_data is only defined when the stock was found on Reuters
if not massive_scrape_mode:
return False
except Exception as e:
print(e)
time.sleep(30) # Make this worker wait a bit before quitting in case Reuters.com
# is acting up
driver.quit() # The webdriver is killed upon receiving an error
# to free up RAM
# get_data_for_stock('ABT')
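# A minimal batch-driver sketch, assuming massive_scrape_mode has been set to
# True (so get_data_for_stock() writes its CSVs) and that the processed/ and
# reuters_data/ folders may not exist yet. scrape_many and the ticker list in
# the example call are hypothetical, not part of the original workflow.
def scrape_many(tickers):
    # Create the bookkeeping/output folders if they don't exist yet
    os.makedirs('processed', exist_ok = True)
    os.makedirs('reuters_data', exist_ok = True)
    # Skip tickers that already have a marker folder in processed/
    already_done = set(os.listdir('processed'))
    todo = [t for t in tickers if t.replace('.', '_') not in already_done]
    # One worker per CPU; each call opens its own headless Firefox instance
    Parallel(n_jobs = multiprocessing.cpu_count(), backend = 'threading', verbose = 10)(
        delayed(get_data_for_stock)(t) for t in todo)
# Example: scrape_many(['AAPL', 'MSFT', 'ABT'])  # hypothetical ticker list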
def convert_link_to_data(link):
# convert_link_to_data()
# Given a Reuters article link, it will parse the article for authors,
# the article's publish date, and the article's content.
# Input: link (str) - link to a designated Reuters article
# Output: a list of [authors (list of strs), publish date (datetime),
# full raw article text], or a list of NaNs if the download/parse fails
try:
article = Article(link) # Instantiate the Article() object
article.download() # Download the article
article.parse() # Parse the article for data
authors = article.authors # Get the article's authors
publish_date = article.publish_date # Get the date the article was published on
text = article.text # Get the article's main text
return [authors, publish_date, text]
except Exception as e:
# If the article can't be downloaded or parsed, an error will be thrown
# and NaNs will be returned instead.
return [np.nan, np.nan, np.nan]
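# Example usage (hypothetical URL):
# convert_link_to_data('https://www.reuters.com/article/some-article-id')
# returns [authors, publish_date, body_text] on success,
# or [nan, nan, nan] if the download or parse fails.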
def get_data_for_stock_lb_base(stock: str, days_to_look_back: int):
# get_data_for_stock_lb_base()
# Like get_data_for_stock(), but only collects articles published within the last
# {days_to_look_back} days, and returns the headlines and links instead of
# writing anything to disk.
# Input: stock (str) - ticker symbol of a designated stock
#        days_to_look_back (int) - how many days back to collect articles for
# Output: pandas DataFrame with 'text' (headline) and 'link' columns, or np.nan on failure
# Set the Firefox webdriver to run headless in the background
fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.set_headless()
driver = webdriver.Firefox(options = fireFoxOptions) # Initialize the webdriver instance in the background
try:
# Search Reuters for {stock}
driver.get('https://www.reuters.com/search/news?blob={}'.format(stock))
time.sleep(2) # Wait for the page to load
# Reuters should return the company in its search results if it has written
# articles on it. If it hasn't, an error will be thrown and nothing will be returned.
# This line gets the top search result's element, which contains
# the stock's name and the URL of Reuters's page on the stock
text = driver.find_element_by_xpath('/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a').text
# {condition} will determine if the stock queried is actually
# the stock that we're trying to get articles on
condition = False
# Reuters will format the element's text in 2 ways:
# {company name} ({ticker}.{some additional text})
# e.g.
# Apple Inc (AAPL.OQ)
# or
# {company name} ({ticker})
# e.g.
# Alcoa Corp (AA)
# The following lines of code will filter down the element's text
# to just get {ticker}
# e.g.
# Apple Inc (AAPL.OQ) --> AAPL
# Alcoa Corp (AA) --> AA
if '.' in text: # Check if a period is in the text
# Check if {ticker}.upper() == {stock}.upper()
if text[text.find('(') + 1:text.find('.')].upper() == stock.upper():
condition = True # {condition} = True means that the queried stock is a match
else: # If there is no period
# Check if {ticker}.upper() == {stock}.upper()
if text[text.find('(') + 1: text.find(')')].upper() == stock.upper():
condition = True # {condition} = True means that the queried stock is a match
if condition: # If {stock} has been found in Reuters, continue
# Click the element's link, going to Reuters's page on the stock
driver.find_element_by_xpath('/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a').click()
time.sleep(0.5) # Let the stock's Reuters page load
# Go to the "News" section of the stock's Reuters page
driver.find_element_by_xpath('/html/body/div[1]/div/div[3]/div/div/nav/div[1]/div/div/ul/li[2]/button').click()
time.sleep(5)
# The next segment will scroll down to the bottom of the "News"
# page of the stock's Reuters page
SCROLL_PAUSE_TIME = 0.5 # This is how much time it waits before scrolling to
# the bottom of the page again
# Get the last height of the page
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait for more content to load
time.sleep(SCROLL_PAUSE_TIME)
# Get the current height of the page
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
# If the current height of the page is the same as it was before,
# break the script because there is no more content to load.
break
last_height = new_height # Update the last recorded height of the page
i = 1 # Article index number starts at one in the HTML
datas = [] # Put all of the data in here
tol = 0 # tol is the number of times an error has been thrown (for this stock's news page).
# Once it reaches 3, it's assumed that all articles for this stock
# have been collected.
# The amount of articles on the page is unknown, so a while loop is
# used to iterate until no more new articles are found
while True:
try:
# This is the xpath for the title of the article...
xpath = '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]/div/a'.format(i)
# ...and this one points at the same article's relative timestamp
# (both are built before i is incremented, so the header, link, and date
# all come from the same article)
date_xpath = '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]/div/div/time'.format(i)
i += 1
# An error will be thrown if there are no more articles left because
# the driver won't be able to find the non-existent next article.
header = driver.find_element_by_xpath(xpath).text # Get the article's header text
link = driver.find_element_by_xpath(xpath).get_attribute('href') # Get the link of the article
date = driver.find_element_by_xpath(date_xpath).text # Get the article's relative date (e.g. "5 days ago")
datas.append([header, link])
# # print(header, link, date)
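# Reuters shows relative timestamps like "5 days ago" here. The lines below
# strip the trailing word ("ago") by reversing the string, cutting at the
# first space, and reversing back, then parse the remainder ("5 days") with
# pd.Timedelta. Once an article older than the lookback window appears, the
# loop stops; timestamps that don't parse this way (e.g. absolute dates) are
# simply skipped by the bare except.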
try:
units_behind = date[::-1]
units_behind = units_behind[units_behind.find(' ') + 1:][::-1]
units_behind = pd.Timedelta(units_behind)
if units_behind > pd.Timedelta('{} days'.format(days_to_look_back)):
break
except:
pass
# The links themselves are processed separately (see convert_link_to_data)
# so fewer threads have to be devoted to this loop
except Exception as e:
# # print(e)
tol += 1 # Increase the error tally by 1
time.sleep(30)
if tol > 2: # Only break once 3 errors have been thrown, to confirm
# that all of the articles have actually been found
break
datas = pd.DataFrame(datas, columns = ['text', 'link']) # Compile the list of headers and links into a pandas DataFrame
# datas.to_csv('reuters_data/{}.csv'.format(stock)) # Export the data to the reuters data folder under the name {stock}.csv
# Stop the driver
driver.quit()
return datas
except Exception as e:
time.sleep(30) # Make this worker wait a bit before quitting in case Reuters.com
# is acting up
driver.quit() # The webdriver is killed upon receiving an error
# to free up RAM
return np.nan
def get_data_for_stock_with_lookback(stock: str, days_to_look_back: int):
# get_data_for_stock_with_lookback()
# Collects Reuters headlines/links for {stock} from the last {days_to_look_back}
# days and parses each linked article for its author, publish date, and body text.
# Output: pandas DataFrame with 'author', 'publish_date', and 'text' columns
news_releases = get_data_for_stock_lb_base(stock, days_to_look_back)
if not isinstance(news_releases, pd.DataFrame): # the scrape failed and returned np.nan
return pd.DataFrame(columns = ['author', 'publish_date', 'text'])
datas = []
for link in news_releases['link']:
data = convert_link_to_data(link)
datas.append(data)
return pd.DataFrame(datas, columns = ['author', 'publish_date', 'text'])
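# Example usage (hypothetical ticker and lookback window; requires Firefox and
# geckodriver on the PATH, plus a network connection):
# df = get_data_for_stock_with_lookback('AAPL', days_to_look_back = 30)
# print(df.head())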