我编写了一个网页抓取脚本,使用正则表达式和 selenium 来查找房屋公司的联系方式,我没有使用其他网页抓取工具的经验,并且我愿意接受所有的批评和建议......
我制作了一个网页抓取脚本,使用正则表达式和 selenium 来查找房屋公司的联系方式,我没有使用其他网页抓取工具的经验,并且我愿意接受所有的批评和建议。
我需要帮助,以便更稳定地找到正确的联系方式;此外,如果有人有适用于英国电话号码的正则表达式,也请分享。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import os
from selenium.common.exceptions import TimeoutException
# Locations of the packed Chrome extension (.crx) and of the chromedriver binary.
extension_path = 'xxx'
chromedriver_path = '/usr/local/bin/chromedriver'

# Fail early, with a readable message, when the extension file is missing.
if not os.path.exists(extension_path):
    raise FileNotFoundError(f"The extension file at {extension_path} does not exist.")

# Driver service and browser options; the extension is loaded through the options.
service = Service(chromedriver_path)
chrome_options = Options()
chrome_options.add_extension(extension_path)

# Start Chrome with the extension installed.
driver = webdriver.Chrome(options=chrome_options, service=service)

# Give up on any single page load after 20 seconds.
driver.set_page_load_timeout(20)
# Regular expressions
#
# phone_regex matches UK numbers written in international form ("+44 ...") or
# in national form with a leading 0 and a 5-, 4- or 3-digit area code.
#   * A single optional whitespace is allowed after the (optionally
#     parenthesised) area code, so "020 7946 0958", "0161 496 0000" and
#     "01632 960123" match as written on most pages.
#   * The lookbehind/lookahead stop the pattern from matching a slice out of
#     the middle of a longer digit run (IDs, timestamps, ...), which is a
#     major source of bogus "phone numbers" when scanning raw page source.
#   * Only non-capturing groups are used, so findall() keeps returning the
#     whole match as a plain string.
phone_regex = re.compile(
    r'(?<![\d+])'  # must not start inside a longer number
    r'(?:'
    r'\+44\s?\d{4}\s?\d{6}'
    r'|\(?0\d{4}\)?\s?\d{3}\s?\d{3}'
    r'|\(?0\d{3}\)?\s?\d{3}\s?\d{4}'
    r'|\(?0\d{2}\)?\s?\d{4}\s?\d{4}'
    r')'
    r'(?!\d)'  # must not run on into further digits
)

# email_regex is a deliberately loose "something@domain.tld" pattern.
# NOTE(review): applied to raw HTML it also matches asset names such as
# "logo@2x.png"; callers may want to filter on known file extensions.
email_regex = re.compile(
    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
)
def locate_contact():
    """Look for a "contact" element on the page currently loaded in ``driver``.

    Searches the whole document for the first element whose text contains
    'Contact us', 'Contact' or 'Get in touch' and reports the outcome on
    stdout.  The element is only located; this function does not click it or
    navigate anywhere.

    Returns:
        The matching element, or ``None`` when the lookup fails for any
        reason (the failure is printed, not raised).
    """
    xpath = "//*[contains(text(), 'Contact us') or contains(text(), 'Contact') or contains(text(), 'Get in touch')]"
    try:
        contact_element = driver.find_element(By.XPATH, xpath)
    except Exception as e:
        # Best effort: a missing contact element must not abort the scrape.
        print(f"Couldn't find a contact page: {e}")
        return None
    print("Contact us link found")
    # Hand the element back so a caller can follow it if it wants to.
    return contact_element
def get_contact():
    """Scrape contact details from the page currently loaded in ``driver``.

    Runs ``phone_regex`` and ``email_regex`` over the page source, keeps at
    most three distinct phone numbers and three distinct e-mail addresses (in
    order of first appearance), prints them, and appends one row
    ``[name, emails, phone_numbers]`` to the module-level ``all_data`` list.
    The name is the page title cut to 53 characters with ',' and '-'
    replaced by '|'.
    """
    page_source = driver.page_source

    # Deduplicate first and truncate last, so repeated matches do not use up
    # the three available slots; dict.fromkeys keeps first-seen order, which
    # makes the output reproducible between runs.
    # The sanity filter counts digits rather than characters, so spaced forms
    # such as "+44 1632 960123" (15 characters) are not thrown away.
    phone_numbers = [
        num
        for num in dict.fromkeys(phone_regex.findall(page_source))
        if 6 < sum(ch.isdigit() for ch in num) < 15
    ][:3]
    emails = list(dict.fromkeys(email_regex.findall(page_source)))[:3]

    # No navigation happens inside this function, so the snapshot taken above
    # is reused instead of fetching the page source a second time.
    phone_numbers, emails = retry_missing_details(driver, phone_numbers, emails, page_source)

    housing_name = driver.title[:53].replace(',', '|').replace('-', '|')

    print(housing_name)
    print(f"Found emails: {emails}")
    print(f"Found phone numbers: {phone_numbers}")
    print('----------------------------------------------')
    all_data.append([housing_name, ', '.join(emails), ', '.join(phone_numbers)])
def retry_missing_details(driver, phone_numbers, emails, main_page_source):
    """Fill in whichever of the two detail lists is empty from ``main_page_source``.

    Args:
        driver: Accepted for call compatibility; not used by this function.
        phone_numbers: Phone numbers found so far (may be empty).
        emails: E-mail addresses found so far (may be empty).
        main_page_source: Page source text to search for the missing details.

    Returns:
        A ``(phone_numbers, emails)`` tuple.  A list that was non-empty on
        entry is returned unchanged; an empty one is replaced by up to three
        distinct matches from ``main_page_source`` in order of appearance.
    """
    if phone_numbers and emails:
        # Nothing is missing, so there is nothing to retry.
        return phone_numbers, emails

    print("Retrying main page for missing details")

    if not phone_numbers:
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the three numbers kept are the first three on the page rather than
        # an arbitrary three.  The filter counts digits, not characters, so
        # spaced or parenthesised numbers are not discarded.
        candidates = dict.fromkeys(phone_regex.findall(main_page_source))
        phone_numbers = [
            num for num in candidates
            if 6 < sum(ch.isdigit() for ch in num) < 15
        ][:3]

    if not emails:
        emails = list(dict.fromkeys(email_regex.findall(main_page_source)))[:3]

    return phone_numbers, emails
# List to store all data; each entry is [name, emails, phone numbers].
all_data = []
try:
    # Read URLs from the first column of the CSV file.
    with open('housing_association_links.csv', mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; an empty file simply yields no URLs
        urls = [row[0] for row in reader if row]  # blank lines parse as empty rows

    for url in urls:
        try:
            driver.get(url)
            # Wait for the page to fully load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))  # Change to a more specific element if needed
            )
            locate_contact()
            get_contact()
        except TimeoutException:
            # A slow site must not stop the run; move on to the next URL.
            print(f"Page load timed out for URL {url}. Moving to the next URL.")
        except Exception as e:
            # Top-level boundary for one URL: report and keep going.
            print(f"Error processing URL {url}: {e}")
finally:
    try:
        # Drop duplicate rows while keeping the order in which they were
        # scraped, so the output file is reproducible between runs.
        all_data = [list(row) for row in dict.fromkeys(tuple(row) for row in all_data)]
        csv_file = "housing_associations.csv"
        with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Housing Association Name", "Contact Email Address", "Telephone Number"])
            writer.writerows(all_data)
    finally:
        # Always shut the browser down, even if writing the results failed.
        driver.quit()
    print('done')
此代码确实有效,但我更关心如何提高效率,因为很多网站只会返回正则表达式认为是电话号码/电子邮件的随机字符串。我想知道我是否可以使用另一种网络抓取方法,然后比较两种方法的结果以获得更准确的信息。任何帮助都将不胜感激。
谢谢。