8wDlpd.png
8wDFp9.png
8wDEOx.png
8wDMfH.png
8wDKte.png

有关网页抓取技术的建议以及如何实现联系信息

askiiart.net 3月前

27 0

我编写了一个网页抓取脚本,使用正则表达式和 selenium 来查找房屋公司的联系方式,我没有使用其他网页抓取工具的经验,并且我愿意接受所有的批评和建议......

我制作了一个网页抓取脚本,使用正则表达式和 selenium 来查找房屋公司的联系方式,我没有使用其他网页抓取工具的经验,并且我愿意接受所有的批评和建议。

我需要帮助,以便更稳定地找到正确的联系信息。此外,如果有人有适用于英国电话号码的正则表达式,也欢迎分享。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import os
from selenium.common.exceptions import TimeoutException 

# --- Browser setup -----------------------------------------------------------
# Everything below runs at import time and leaves a live Chrome session in the
# module-level name `driver`, which the functions further down read directly.

# Path to the extension .crx file
# NOTE(review): 'xxx' looks like a redacted placeholder -- set a real path.
extension_path = 'xxx'
# Location of the chromedriver binary handed to the Selenium Service below.
chromedriver_path = '/usr/local/bin/chromedriver'

# Verify the extension file exists
# Checked before any browser is started, so a bad path fails without
# launching Chrome.
if not os.path.exists(extension_path):
    raise FileNotFoundError(f"The extension file at {extension_path} does not exist.")

# Set up Chrome options
chrome_options = Options()
chrome_options.add_extension(extension_path)

# Initialize ChromeDriver with the extension
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Set page load timeout
# The main loop catches TimeoutException around driver.get(), which is how an
# over-long page load is reported. Value is in seconds per the Selenium docs.
driver.set_page_load_timeout(20)

# Regular expressions
# UK phone numbers, written nationally ("01234 567890", "020 7946 0958",
# "(0161) 496 0000") or internationally ("+44 1234 567890",
# "+44 (0)20 7946 0958"). Each group of digits may be separated by a single
# whitespace character. The pattern deliberately contains no capturing groups
# so that `findall` keeps returning the full matched text.
# The look-around guards stop the pattern from firing in the middle of a
# longer digit run (IDs, timestamps, asset hashes in the page source).
phone_regex = re.compile(
    r'''
    (?<![\d+])                        # must not start inside a longer number
    (?:
        \+44\s?(?:\(0\)\s?)?          # international prefix, optional "(0)"
      | \(?0                          # or the national trunk prefix
    )
    (?:
        \d{4}\)?\s?\d{3}\s?\d{3}      # 5-digit code: 01234 567 890, 07123 456789
      | \d{3}\)?\s?\d{3}\s?\d{4}      # 4-digit code: 0161 496 0000, 0800 123 4567
      | \d{2}\)?\s?\d{4}\s?\d{4}      # 3-digit code: 020 7946 0958
    )
    (?!\d)                            # must not stop inside a longer number
    ''',
    re.VERBOSE,
)
# Loose e-mail matcher: a local part, "@", a dotted domain and an alphabetic
# final label of at least two letters. It is run over raw page source, so it
# also matches address-shaped strings that are not e-mails, e.g. asset names
# such as "logo@2x.png".
email_regex = re.compile(
    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
)

def locate_contact():
    """Look for a "contact" element on the page currently loaded in `driver`.

    The lookup is best-effort: any error raised while searching is reported
    on stdout and swallowed so the caller can carry on with the next step.

    Returns:
        The element found by the XPath query, or ``None`` when the lookup
        failed. Earlier versions always returned ``None``; returning the
        element lets a caller click or inspect it without repeating the query.
    """
    try:
        # XPath `contains(text(), ...)` is case-sensitive.
        contact_element = driver.find_element(By.XPATH, "//*[contains(text(), 'Contact us') or contains(text(), 'Contact') or contains(text(), 'Get in touch')]")
    except Exception as e:
        print(f"Couldn't find a contact page: {e}")
        return None

    print("Contact us link found")
    return contact_element

def _unique_first(items, limit=3):
    """Return the first *limit* distinct items, keeping first-seen order."""
    return list(dict.fromkeys(items))[:limit]


def _has_plausible_digit_count(number):
    """Tell whether *number* contains more than 6 and fewer than 15 digits."""
    return 6 < sum(ch.isdigit() for ch in number) < 15


def get_contact():
    """Scrape contact details from the page currently loaded in `driver`.

    Runs `phone_regex` and `email_regex` over the page source, keeps up to
    three distinct hits of each kind, gives `retry_missing_details` a chance
    to fill in whichever kind is still empty, prints a summary and appends
    ``[name, emails, phone_numbers]`` to the module-level `all_data` list.
    The name is the page title cut to 53 characters with ',' and '-' replaced
    by '|'; e-mails and phone numbers are each joined with ', '.
    """
    page_source = driver.page_source

    # De-duplicate *before* truncating, and keep page order: truncating first
    # let repeated hits crowd out distinct ones, and going through a set made
    # the order of the results vary between runs.
    # Plausibility is judged on digits rather than characters so that spaces,
    # "+" and brackets do not disqualify a number such as "+44 1234 567890".
    phone_numbers = _unique_first(
        num for num in phone_regex.findall(page_source)
        if _has_plausible_digit_count(num)
    )
    emails = _unique_first(email_regex.findall(page_source))

    # A second snapshot of the same page is handed to the retry step; it is
    # only scanned for the kinds of detail that are still missing.
    main_page_source = driver.page_source
    phone_numbers, emails = retry_missing_details(driver, phone_numbers, emails, main_page_source)

    housing_name = driver.title[:53].replace(',', '|').replace('-', '|')

    print(housing_name)
    print(f"Found emails: {emails}")
    print(f"Found phone numbers: {phone_numbers}")
    print('----------------------------------------------')

    all_data.append([housing_name, ', '.join(emails), ', '.join(phone_numbers)])


def retry_missing_details(driver, phone_numbers, emails, main_page_source):
    """Fill in missing phone numbers or e-mails from *main_page_source*.

    Args:
        driver: Accepted for interface compatibility; not used here.
        phone_numbers: Phone numbers found so far (may be empty).
        emails: E-mail addresses found so far (may be empty).
        main_page_source: HTML text to scan when something is missing.

    Returns:
        A ``(phone_numbers, emails)`` tuple. A list that was already
        non-empty is returned unchanged; an empty one is replaced by up to
        three distinct matches from *main_page_source*, in page order.
    """
    if phone_numbers and emails:
        return phone_numbers, emails

    print("Retrying main page for missing details")

    if not phone_numbers:
        # Count digits, not characters, so spaces, "+" and brackets do not
        # disqualify a number such as "+44 1234 567890".
        candidates = (
            num for num in phone_regex.findall(main_page_source)
            if 6 < sum(ch.isdigit() for ch in num) < 15
        )
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # three values kept are the first three on the page rather than an
        # arbitrary pick from a set.
        phone_numbers = list(dict.fromkeys(candidates))[:3]

    if not emails:
        emails = list(dict.fromkeys(email_regex.findall(main_page_source)))[:3]

    return phone_numbers, emails

# List to store all data
# get_contact() appends one [name, emails, phone_numbers] row per visited URL.
all_data = []

try:
    # Read URLs from the CSV file
    with open('housing_association_links.csv', mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default keeps an empty file from raising
        # Blank lines come back as empty rows; skip them instead of letting
        # row[0] raise IndexError and abort the whole run.
        urls = [row[0] for row in reader if row]

    for url in urls:
        try:
            driver.get(url)

            # Wait for the page to fully load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))  # Change to a more specific element if needed
            )

            locate_contact()
            get_contact()
        except TimeoutException:
            # A slow site should not stop the run; move on to the next URL.
            print(f"Page load timed out for URL {url}. Moving to the next URL.")
        except Exception as e:
            # Top-level boundary for one URL: report and keep going.
            print(f"Error processing URL {url}: {e}")

finally:
    # Whatever was collected is written out even if the run was interrupted,
    # and the browser is closed even if writing the results fails.
    try:
        csv_file = "housing_associations.csv"
        with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Housing Association Name", "Contact Email Address", "Telephone Number"])

            # Drop duplicate rows while keeping the order in which the sites
            # were visited (a plain set would shuffle the output).
            unique_rows = dict.fromkeys(tuple(row) for row in all_data)
            writer.writerows(unique_rows)
    finally:
        driver.quit()

print('done')

ere

此代码确实有效,但我更关心如何提高效率,因为很多网站只会返回正则表达式认为是电话号码/电子邮件的随机字符串。我想知道我是否可以使用另一种网络抓取方法,然后比较两种方法的结果以获得更准确的信息。任何帮助都将不胜感激。

谢谢。

帖子版权声明 1、本帖标题:有关网页抓取技术的建议以及如何实现联系信息
    本站网址:http://xjnalaquan.com/
2、本网站的资源部分来源于网络,如有侵权,请联系站长进行删除处理。
3、会员发帖仅代表会员个人观点,并不代表本站赞同其观点和对其真实性负责。
4、本站一律禁止以任何方式发布或转载任何违法的相关信息,访客发现请向站长举报
5、站长邮箱:yeweds@126.com 除非注明,本帖由askiiart.net在本站《python》版块原创发布, 转载请注明出处!
最新回复 (0)
返回
作者最近主题: