GitHub link: https://github.com/guruprasadbotics/google_news_scraper_tocsv/tree/main
This Google News scraper searches for news matching a given keyword and saves the relevant results to a CSV file for download.
Install necessary packages:
pip install requests
pip install beautifulsoup4
Import the necessary libraries:
import requests
from bs4 import BeautifulSoup
import pandas as pd
Copy and paste the following code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Search query: the keyword(s) to look up on Google News.
query = 'Indian Stock Market'
# Encode special characters in a text string
def encode_special_characters(text):
    """Lower-case *text* and percent-encode it for use in a URL query string.

    The previous hand-rolled table only handled '&', '=', '+', and ' ',
    leaving other URL-reserved characters ('#', '?', '/', '%', ...)
    unencoded and liable to break the search URL. urllib.parse.quote
    covers all of them and produces identical output for the characters
    the table did handle.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained
    # safe='' so that '/' and other reserved characters are encoded too.
    return quote(text.lower(), safe='')
# Build the search URL. hl/gl/ceid pin the edition to US English so the
# markup layout (and therefore the positional fields below) is predictable.
query2 = encode_special_characters(query)  # percent-encoded search query
url = f"https://news.google.com/search?q={query2}&hl=en-US&gl=US&ceid=US%3Aen"

response = requests.get(url)
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')

# Each search result is an <article>; its first <a> carries a relative link.
# Guard against articles with no anchor or no href so one odd result
# doesn't crash the whole run.
articles = soup.find_all('article')
anchors = [article.find('a') for article in articles]
links = [a['href'] if a is not None and a.has_attr('href') else '' for a in anchors]
links = [link.replace("./articles/", "https://news.google.com/articles/") for link in links]

# The article text splits on newlines into positional fields:
# [0] source, [2] title, [3] time, [4] author. The positions depend on
# Google's current markup, so every index is guarded against short rows
# (previously only Time/Author were guarded; Title/Source could raise
# IndexError).
news_text = [article.get_text(separator='\n') for article in articles]
news_text_split = [text.split('\n') for text in news_text]

news_df = pd.DataFrame({
    'Title': [text[2] if len(text) > 2 else 'Missing' for text in news_text_split],
    'Source': [text[0] if text else 'Missing' for text in news_text_split],
    'Time': [text[3] if len(text) > 3 else 'Missing' for text in news_text_split],
    'Author': [text[4].split('By ')[-1] if len(text) > 4 else 'Missing' for text in news_text_split],
    'Link': links
})

# Write to CSV
news_df.to_csv('news.csv', index=False)