También pasa en el subforo de la 3GM. Pasa en los hilos que están en Archivo. Es el principio del fin.
Empezó en Archivo y Coronabicho, y después se ha ido extendiendo a otros hilos. También pasa en el subforo de la 3GM.
Sí, a todos. ¿A alguien más le aparece esto?
Pesan mucho y los tengo dispersos en carpetas por temáticas, por ahora paso. Sube a algún sitio comprimidos los hilos que hayas guardado.
python script.py <html_file>
#!/usr/bin/env python
"""Elimina firmas, encabezados y otros elementos para hacer menos pesado el html de Burbuja.info"""
import sys
from bs4 import BeautifulSoup
from datetime import datetime
# --- CLI handling ------------------------------------------------------------
if len(sys.argv) != 2:
    print("Usage: python script.py <html_file>")
    sys.exit(1)

html_file_path = sys.argv[1]
# Output is written next to the input, with a 'comp.html' suffix appended.
html_file_output = sys.argv[1] + 'comp.html'

# Read the HTML content from the file.
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Remove bulky, non-content elements (signatures, action bars, nav, footer,
# quick-reply box, attribution lists) to shrink the saved page.
# NOTE: entries containing spaces are matched by bs4 against the element's
# full class attribute string.
classes_to_remove = [
    'message-signature',
    'message-actionBar',
    'p-sectionLinks',
    'p-nav-inner',
    'p-footer',
    'message message--quickReply block-topRadiusContent block-bottomRadiusContent',
    'message-attribution-opposite message-attribution-opposite--list',
]
for class_name in classes_to_remove:
    for element in soup.find_all(class_=class_name):
        element.decompose()

try:
    # Find <time class="u-dt"> elements whose visible text contains 'ayer'
    # ("yesterday"). The None guard avoids a crash on text-less tags, and
    # `string=` replaces the deprecated `text=` keyword.
    time_elements = soup.find_all(
        'time', class_='u-dt',
        string=lambda text: text is not None and 'ayer' in text.lower(),
    )
    for time_element in time_elements:
        # Parse the absolute timestamp stored in the element attributes.
        # NOTE(review): '%b' parsing is locale-dependent — Spanish month
        # abbreviations may not parse under a C locale; confirm.
        date_string = time_element['data-date-string']
        time_string = time_element['data-time-string']
        parsed_datetime = datetime.strptime(
            f'{date_string} {time_string}', '%d %b %Y %I:%M %p')
        # Overwrite the attributes with the current date/time so the saved
        # page no longer carries the relative "ayer" rendering.
        current_datetime = datetime.now()
        time_element['data-date-string'] = current_datetime.strftime('%d %b %Y')
        time_element['data-time-string'] = current_datetime.strftime('%I:%M %p')
        time_element['datetime'] = current_datetime.strftime('%Y-%m-%dT%H:%M:%S%z')
        time_element['title'] = current_datetime.strftime('%d %b %Y a la(s) %I:%M %p')
        # The visible text keeps the original (parsed) timestamp, reformatted.
        time_element.string = parsed_datetime.strftime('%d %b %Y a la(s) %I:%M %p')
except (KeyError, ValueError) as exc:
    # Missing data-* attributes or an unparseable date: keep originals.
    print(f"time error: {exc}")

# Write the modified HTML to the output file.
with open(html_file_output, 'w', encoding='utf-8') as file:
    file.write(soup.prettify())
#!/usr/bin/env python
# Extrae solo texto plano (revisar citas a usuarios)
import re
import requests
from bs4 import BeautifulSoup
def extract_messages(html_content):
    """Return the cleaned plain-text body of every post in *html_content*.

    Each post body lives in a <div class="message-content js-messageContent">;
    quoted posts inside it are collapsed by remove_expandable_content().
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    messages = []
    message_contents = soup.find_all('div', class_='message-content js-messageContent')
    for message_content in message_contents:
        message_text = message_content.get_text(strip=True)
        messages.append(remove_expandable_content(message_text))
    return messages
def extract_users(html_content):
    """Return the plain-text user details of every post in *html_content*.

    Each poster's details live in a <div class="message-userDetails">; any
    embedded quote markup is collapsed by remove_expandable_content().
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    users = []
    user_details = soup.find_all('div', class_='message-userDetails')
    for user_detail in user_details:
        user_text = user_detail.get_text(strip=True)
        users.append(remove_expandable_content(user_text))
    return users
def remove_expandable_content(text):
    """Collapse quoted-post blocks into a ' fue citado ]' marker.

    Matches spans starting at 'dijo:' and ending at the literal
    'clic para expandir...' (re.DOTALL lets a quote span several lines;
    the non-greedy '.*?' keeps separate quotes from merging into one match)
    and replaces each with ' fue citado ]\\n '.
    """
    pattern = re.compile(r'dijo:.*?clic para expandir\.\.\.', re.DOTALL)
    return pattern.sub(' fue citado ]\n ', text)
if __name__ == "__main__":
import sys
if len(sys.argv) != 2:
print("Usage: python script.py <html_file_or_url>")
sys.exit(1)
input_path = sys.argv[1]
if input_path.startswith('http://') or input_path.startswith('https://'):
# Fetch HTML content from the URL
response = requests.get(input_path)
html_content = response.text
else:
# Read HTML content from the local file
with open(input_path, 'r', encoding='utf-8') as file:
html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')
# Encuentra y elimina todos los elementos <h5> con la clase especificada
user_titles = soup.find_all('h5', class_='userTitle message-userTitle')
for user_title in user_titles:
user_title.decompose()
# Extract and process messages from the modified HTML content
users = extract_users(str(soup))
messages = extract_messages(str(soup))
# Print the extracted and processed messages
for i in range(0, len(messages)):
print(f"{i+1}:({users[i]}): {messages[i]}\n")
#!/usr/bin/env python
# Extrae enlaces de hilos de burbuja.info cuyos títulos contienen cifras numéricas en el subforo de Economía
# Salida: CSV
import requests
import re
from bs4 import BeautifulSoup
# Scrape pages 1-9 of the Economía subforum and collect [title, url] pairs
# for thread links whose titles contain digits; dump the result as CSV.
enlaces = []

# Compiled once, outside the loop (they are loop-invariant).
# Titles must contain at least one digit to be kept.
digit_pattern = re.compile(r'\d+')
# Link texts that are really timestamps ("12 Ene 2021", "hace 5 minutos"...)
# must be excluded so date links aren't mistaken for thread titles.
timestamp_pattern = re.compile(
    r'\d{1,2}\s\w{3}\s\d{4}|hace \d+ minutos| a la\(s\) |Hace un momento|Hace 1 minuto'
)

for page_number in range(1, 10):
    website = "https://www.burbuja.info/inmobiliaria/forums/economia/page-" + str(page_number)
    page = requests.get(website)
    soup = BeautifulSoup(page.content, "html.parser")
    for link in soup.find_all("a"):
        url = 'https://www.burbuja.info' + str(link.get("href"))
        title = str(link.string)
        looks_like_timestamp = bool(timestamp_pattern.search(title))
        has_digits = bool(digit_pattern.search(title))
        # Keep only thread links with a "real" numeric title; len > 4 filters
        # out short link texts such as bare page numbers.
        if 'threads' in url and not looks_like_timestamp and has_digits and len(title) > 4:
            enlaces.append([title, url])

import pandas

pandas.DataFrame(enlaces).to_csv('/usr/share/nginx/html/static/bbja.csv', sep=';')
A pagaaaaarrr. ¿A alguien más le aparece esto?
Requires account upgrade to view this reply. ¿Cuál era el foro alternativo que hicieron los foreros? Por lo menos allí estaremos mejor comunicados hasta que sepamos a ciencia cierta lo que está pasando en Burbuja.
En la cuenta premium hay un mod para ver desnudos a los foreros.