Google News Scraper

This Google News scraper collects news articles matching a given keyword and saves the results to a CSV file that you can download.

Install necessary packages:

pip install requests
pip install beautifulsoup4
pip install pandas

Import the necessary libraries:

import requests
from bs4 import BeautifulSoup
import pandas as pd

Copy and paste the following code:

from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search query: the keyword or phrase to look up on Google News.
# It is URL-encoded below before being placed in the request URL.
query = 'Indian Stock Market'

# Encode special characters in a text string
def encode_special_characters(text):
    """Return *text* lower-cased and percent-encoded for use in a URL query.

    Every character that is not an unreserved URL character (letters,
    digits, ``_``, ``.``, ``-``, ``~``) is percent-encoded, so characters
    such as ``#``, ``?``, ``/`` or non-ASCII letters can no longer break or
    truncate the request URL. Spaces become ``%20``, ``&`` becomes ``%26``,
    ``=`` becomes ``%3D`` and ``+`` becomes ``%2B``.

    Args:
        text: The raw search string.

    Returns:
        The lower-cased, percent-encoded string.
    """
    # safe='' so that '/' is encoded too; the value is embedded in a query
    # parameter, not in a path.
    return quote(text.lower(), safe='')

# Percent-encode the search query so it can be embedded safely in the URL.
query2 = encode_special_characters(query)
# Full Google News search URL; hl/gl/ceid are sent as en-US / US / US:en.
url = f"https://news.google.com/search?q={query2}&hl=en-US&gl=US&ceid=US%3Aen"

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and
# silently producing an empty CSV.
response.raise_for_status()
# Parse the returned HTML with the standard-library parser backend.
soup = BeautifulSoup(response.text, 'html.parser')

# Each <article> element is treated as one news item.
articles = soup.find_all('article')


def _field(parts, index):
    """Return parts[index], or 'Missing' when the article has too few lines."""
    return parts[index] if len(parts) > index else 'Missing'


def _article_link(article):
    """Return the absolute URL of the article's first link, or 'Missing'."""
    anchor = article.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return 'Missing'
    # Resolves relative hrefs such as "./articles/..." against the site root;
    # hrefs that are already absolute are returned unchanged.
    return urljoin("https://news.google.com/", href)


links = [_article_link(article) for article in articles]

# One list of text lines per article, in document order.
news_text = [article.get_text(separator='\n') for article in articles]
news_text_split = [text.split('\n') for text in news_text]

# NOTE(review): the line positions used below (0 = source, 2 = title,
# 3 = time, 4 = author) mirror the original script and depend on the page's
# current markup -- verify against a live response. Any position an article
# lacks is recorded as 'Missing' rather than raising IndexError.
news_df = pd.DataFrame({
    'Title': [_field(text, 2) for text in news_text_split],
    'Source': [_field(text, 0) for text in news_text_split],
    'Time': [_field(text, 3) for text in news_text_split],
    # Keeps only what follows the last "By " in the author line.
    'Author': [text[4].split('By ')[-1] if len(text) > 4 else 'Missing' for text in news_text_split],
    'Link': links,
})

# Write to CSV (one row per article, no index column)
news_df.to_csv('news.csv', index=False)

Leave a Comment