2022-03-30

How can I scrape the content of a news article based on its title?

I have a Listbox in which the titles and publication times of news items are scraped from 2 links and shown after clicking the "View Titles" button. This part works correctly.

enter image description here

Now I would like to select a news title in the Listbox, click the "View Content" button, and see the content of that news item in the multiline textbox below the list. Note that on the site the title itself is the link to the news content. However, I have a problem with the function that should do this:

def content():
    """First attempt at showing the selected news content (does not work).

    Intended flow, as the inline comments describe: open the selected title's
    link, read the article text, and put it into ``textbox_download``.
    """
    # NOTE(review): `title` is not defined anywhere in this snippet, so this
    # test cannot run as written; the selection presumably has to be read from
    # the titles Listbox instead - confirm against the full script.
    if title.select:

        # Click on the title link.
        # NOTE(review): `driver` is never created in this snippet (presumably a
        # Selenium WebDriver was intended - confirm).
        driver.find_element_by_tag_name("title").click()

        # Download the content (by class) for the title.
        # NOTE(review): `div` is not defined in this snippet. Also, if `div` is
        # meant to be a BeautifulSoup tag, "text mbottom" is a CSS descendant
        # selector (an <mbottom> tag inside a <text> tag); an element carrying
        # both classes would be selected with ".text.mbottom".
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))

        # Print the content in the textbox.
        textbox_download.insert(tk.END, content_download)

My idea was that, to get this, we would have to simulate a click on the news title to open it (in the HTML it is title), then select the text of the content (in the HTML it is text mbottom), and finally copy it into the textbox of my program. Is that the right approach? What do you think? Obviously I have written the code poorly and it doesn't work. I'm not very good at scraping. Could anyone help me? Thank you.

The complete code is below (it runs correctly and scrapes the titles and times; I don't call the content function in the button). Apart from the function above, the code works well and fetches the title and time of each news item.

from tkinter import *
from tkinter import ttk
import tkinter as tk
import sqlite3
import random
import tkinter.font as tkFont
from tkinter import ttk

# Main application window.
window = tk.Tk()
window.title("x")
window.geometry("800x800")

# Appearance options shared by the two list widgets below.
_listbox_style = dict(
    width=80,
    font=('helvetic', 12),
    selectbackground="#960000",
    selectforeground="white",
    bg="white",
)

# Upper list: all_titles() inserts one row per scraped news item here.
# (Previously this was self.tutti_pronostici, used to show the calls coming
# from the other window.)
textbox_title = tk.Listbox(window, height=16, **_listbox_style)
textbox_title.place(x=1, y=1)

# Lower list: content() inserts the downloaded news content here.
# (Previously this was self.tutti_pronostici, used to show the calls coming
# from the other window.)
textbox_download = tk.Listbox(window, height=15, **_listbox_style)
textbox_download.place(x=1, y=340)

#Download All Titles and Time

# Team label shown in each row -> listing page that is scraped for that team.
TEAM_PAGES = {
    'ATALANTA': 'https://www.tuttomercatoweb.com/atalanta/',
    'BOLOGNA': 'https://www.tuttomercatoweb.com/bologna/',
}

# mock browser request
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}

# Seconds to wait for the site; without a timeout a stalled request would
# block the Tk callback (and therefore the whole window) indefinitely.
REQUEST_TIMEOUT = 15

# Row text shown in textbox_title -> absolute URL of that news article.
# Filled by all_titles() and read by content().
news_links = {}


def _fetch_page(url):
    """Download *url* with the browser-like headers and return it parsed.

    Returns a ``BeautifulSoup`` document built with the 'html.parser' backend.
    Network failures (including the timeout) propagate as ``requests``
    exceptions.
    """
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


def all_titles():
    """Scrape time and title of every listed news item and show them.

    For each team in ``TEAM_PAGES`` the listing page is downloaded and every
    ``div`` inside a ``div.tcc-list-news`` container is turned into one row of
    the form `` {time} {TEAM}, {title} (TMW)``. Rows are sorted in descending
    order and written to ``textbox_title``. The link of each item is remembered
    in ``news_links`` so that content() can open it later.
    """
    from urllib.parse import urljoin

    rows = []
    news_links.clear()

    for team, page_url in TEAM_PAGES.items():
        soup = _fetch_page(page_url)

        for news_list in soup.find_all('div', attrs={"class": "tcc-list-news"}):
            for div in news_list.find_all("div"):
                time_span = div.find('span', attrs={'class': 'hh serif'})
                if time_span is None:
                    # Not a news entry (no time stamp inside this div).
                    continue

                title_spans = div.select("a > span")
                title = " ".join(span.text for span in title_spans)

                row = f" {time_span.text} {team}, {title} (TMW)"
                rows.append(row)

                # The title spans sit directly inside the <a> that links to
                # the article, so the parent of the first span carries the href.
                if title_spans:
                    href = title_spans[0].parent.get('href')
                    if href:
                        # urljoin keeps absolute links and resolves relative ones.
                        news_links[row] = urljoin(page_url, href)

    rows.sort(reverse=True)

    # Start from an empty list so that clicking the button again refreshes the
    # rows instead of appending a second copy of each one.
    textbox_title.delete(0, tk.END)
    for row in rows:
        textbox_title.insert(tk.END, row)


#Download Content of News
def content():
    """Show the content of the news item selected in ``textbox_title``.

    Looks up the link stored by all_titles() for the selected row, downloads
    that page and writes the article text into ``textbox_download``. Does
    nothing when no row is selected or when no link is known for the row
    (for example, when the titles have not been loaded yet).
    """
    import textwrap

    selected = textbox_title.curselection()
    if not selected:
        return

    row = textbox_title.get(selected[0])
    url = news_links.get(row)
    if url is None:
        return

    soup = _fetch_page(url)

    # "text mbottom" is a pair of classes on one element, hence ".text.mbottom"
    # (a plain "text mbottom" would look for an <mbottom> tag inside <text>).
    # NOTE(review): class names are taken from the description accompanying
    # this script - confirm against the live article markup.
    article = " ".join(
        node.get_text(" ", strip=True) for node in soup.select(".text.mbottom")
    )

    # textbox_download is a Listbox, which shows one row per item, so the
    # article is wrapped to the widget's configured width and added line by line.
    wrap_width = int(textbox_download.cget("width")) or 80
    textbox_download.delete(0, tk.END)
    for line in textwrap.wrap(article, width=wrap_width):
        textbox_download.insert(tk.END, line)



# "View Titles" runs all_titles() when clicked.
button = tk.Button(window, text="View Titles", command=all_titles)
button.place(x=1, y=680)

# "View Content" runs content() when clicked.
button2 = tk.Button(window, text="View Content", command=content)
button2.place(x=150, y=680)

# Start the Tk event loop; this call blocks until the window is closed.
window.mainloop()


No comments:

Post a Comment