通过ai获取初步代码

根据文档学完代码后总会感觉有些不知道从何开始，所以这次尝试根据代码学代码，一边改一边学。而初步的代码，打算从ai那里获得。

prompt:
我需要从网址“http://www.xiaoyuanzhaopin.net/bjsyxx/”获取信息，请你为我设计一个爬虫。这个网页的结构如下：
该网页中具有多个<table>标签，每个<table>标签有子元素<tbody>，<tbody>的子元素是数个<tr>,每个<tr>有一个子元素<td>,<td>标签中有两个子<a>标签。我需要的内容在第二个<a>标签跳转到的页面中。
跳转到的页面有两处含有我需要的信息：
第一处：一个class为“entry-header”的<header>标签，其中有一个class为“entry-title”的<h1>标签，我需要这个<h1>标签内的text。
第二处：一个class为“entry-content”的<div>标签，其子元素是多个<p>标签以及一个<figure>标签。<p>标签中要么仅含有text，要么含有一个<a>标签。如果仅含有text，则提取text；如果是<a>标签，则同时保留<a>标签中的text以及链接。<figure>标签中含有一个<tbody>，<tbody>中含有多个<tr>，每个<tr>中有一个<td>，每个<td>中有一个<a>，我需要<a>中的text以及链接。
最终以json格式输出，json中键值说明如下：
1.title: 值为<h1>标签中的text,类型为string
2.content: 值为<p>标签中的内容，类型为{title2 : string, src : string}，title2为<p>标签或其中<a>标签的text，src为<a>标签跳转的链接，如果没有<a>标签则为“”。
3.addContent:值为<figure>中的<a>的内容，类型为{title3 : string, src : string}。title3为<a>标签的text, src为<a>标签跳转的链接
如果你还有什么疑问，请提出，如果没有，请帮我生成相应代码。

AI的回复如下

import requests
from bs4 import BeautifulSoup
import json

# 定义函数来提取目标信息
def extract_info(url):
    """Download one detail page and pull out its title, paragraphs and figure links.

    Args:
        url: Address of the detail page to fetch.

    Returns:
        ``None`` when the page cannot be fetched or does not contain both an
        ``<h1 class="entry-title">`` and a ``<div class="entry-content">``.
        Otherwise a dict with three keys:

        * ``title``: stripped text of the ``entry-title`` heading.
        * ``content``: one ``{'title2', 'src'}`` dict per ``<p>``. When the
          paragraph contains an ``<a>``, the first anchor's text and href are
          used; otherwise the paragraph text with an empty ``src``.
        * ``addContent``: one ``{'title3', 'src'}`` dict per ``<a>`` inside
          the first ``<figure>`` of the content div.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"无法访问网页: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"无法访问网页: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when nothing matches. Callers may pass URLs of pages
    # that lack the article layout, so skip those instead of raising
    # AttributeError on ``None.text``.
    title_tag = soup.find('h1', class_='entry-title')
    content_div = soup.find('div', class_='entry-content')
    if title_tag is None or content_div is None:
        print(f"页面结构不符合预期: {url}")
        return None
    title = title_tag.text.strip()

    # One entry per paragraph: link text + target, or plain text.
    content = []
    for paragraph in content_div.find_all('p'):
        anchor = paragraph.a
        if anchor is not None:
            content.append({
                'title2': anchor.text.strip(),
                # .get() tolerates anchors without an href attribute.
                'src': anchor.get('href', ''),
            })
        else:
            content.append({
                'title2': paragraph.text.strip(),
                'src': '',
            })

    # Links listed inside the first <figure>, if the page has one.
    add_content = []
    figure = content_div.find('figure')
    if figure is not None:
        add_content = [
            {'title3': link.text.strip(), 'src': link.get('href', '')}
            for link in figure.find_all('a')
        ]

    return {
        'title': title,
        'content': content,
        'addContent': add_content,
    }

# Download the index page and collect every anchor found on it.
main_url = "http://www.xiaoyuanzhaopin.net/bjsyxx/"
main_response = requests.get(main_url)
main_soup = BeautifulSoup(main_response.text, 'html.parser')
main_links = main_soup.find_all('a')

# Keep only anchors whose href is present and starts with "http".
candidate_urls = [
    href
    for href in (anchor.get('href') for anchor in main_links)
    if href and href.startswith('http')
]

# Visit each candidate in page order and keep the pages that produced a result.
all_results = []
for link_url in candidate_urls:
    result = extract_info(link_url)
    if result:
        all_results.append(result)

# Emit everything as pretty-printed JSON, leaving non-ASCII text unescaped.
output_json = json.dumps(all_results, ensure_ascii=False, indent=4)
print(output_json)

gpt确实很厉害，大概的轮廓一下子就出来了。

根据遇到的问题修改代码

在尝试运行的过程中，发现了一系列的问题，但从 gpt生成的初步代码已然能看出函数的用法，于是尝试照着规律修改代码，并且再次询问了一次gpt，使结果会输出到一个excel的表格中。最终代码如下：

import requests
import openpyxl
from bs4 import BeautifulSoup
import json
import time
from openpyxl.styles import Font, Alignment, Border, Side
from openpyxl.utils import get_column_letter

# 定义函数来提取目标信息
def _link_entries(anchors):
    """Turn ``<a>`` tags into ``{'title', 'src'}`` dicts; a missing href becomes ''."""
    return [
        {'title': anchor.text.strip(), 'src': anchor.get('href', '')}
        for anchor in anchors
    ]


def extract_info(url, position):
    """Download one school page and flatten its content into title/link entries.

    Entries are collected in this order: every ``<p>`` of the
    ``entry-content`` div (its links, else its images, else its text), then
    the links of the first ``<ul>``, the links of the first ``<ol>``, and
    finally the links of every ``<figure>`` except the last link of each.

    Args:
        url: Address of the school's detail page.
        position: Region label taken from the index page; copied into the
            result unchanged.

    Returns:
        ``{'title': str, 'content': list[dict], 'position': position}`` where
        each content entry is ``{'title': str, 'src': str}``, or ``None`` when
        the page cannot be fetched, lacks the expected layout, or yields no
        entries.

    Side effects:
        Prints the result dict on success. Appends the page title to the
        module-level ``errschool`` list when the page has a title but yields
        no content.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"无法访问网页: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"无法访问网页: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when nothing matches; guard before dereferencing.
    title_tag = soup.find('h1', class_='entry-title')
    if title_tag is None:
        print(f"页面结构不符合预期: {url}")
        return None
    title = title_tag.text.strip()

    content_div = soup.find('div', class_='entry-content')
    if content_div is None:
        errschool.append(title)
        return None

    content = []
    for paragraph in content_div.find_all('p'):
        if paragraph.a is not None:
            # Paragraph with links: one entry per link, paragraph text ignored.
            content.extend(_link_entries(paragraph.find_all('a')))
        elif paragraph.img is not None:
            # Paragraph with images only: numbering restarts at 1 per paragraph.
            content.extend(
                {'title': f'图片{index}-{title}', 'src': img.get('src', '')}
                for index, img in enumerate(paragraph.find_all('img'), start=1)
            )
        else:
            content.append({'title': paragraph.text.strip(), 'src': ''})

    # Only the first <ul> and the first <ol> are inspected.
    for list_tag_name in ('ul', 'ol'):
        list_tag = content_div.find(list_tag_name)
        if list_tag is not None:
            content.extend(_link_entries(list_tag.find_all('a')))

    for figure in content_div.find_all('figure'):
        # NOTE(review): the last link of every figure is deliberately skipped,
        # exactly as before; presumably it is not a data link - confirm
        # against the site.
        content.extend(_link_entries(figure.find_all('a')[:-1]))

    if not content:
        errschool.append(title)
        return None

    result = {
        'title': title,
        'content': content,
        'position': position,
    }
    print(result)
    return result

# Results collected from every detail page that yielded content.
all_results: list = []
# Titles of schools whose pages produced no extractable content.
errschool: list = []

def creat_excel(data, location):
    """Write the crawl results to ``./excelInfo/<location>.xlsx``.

    The sheet is named "Data" and has four columns: school, position, content
    text and link. Each item occupies one row per content entry; its school
    and position cells are merged vertically across those rows, and a blank
    row separates consecutive items.

    Args:
        data: Iterable of dicts with keys ``title``, ``position`` and
            ``content``; ``content`` is a list of ``{'title', 'src'}`` dicts.
            Items whose ``content`` is empty are skipped.
        location: Name used for the output file (without extension).

    Side effects:
        Creates ``./excelInfo`` if needed, saves the workbook there and
        prints a confirmation message.
    """
    # Function-scope import so the block does not depend on a file-level import.
    import os

    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = "Data"

    # Shared styles.
    title_font = Font(size=12, bold=True)
    title_alignment = Alignment(vertical='center')
    url_font = Font(color="0000FF", underline="single")
    thin = Side(style='thin')
    border = Border(left=thin, right=thin, top=thin, bottom=thin)

    # Header row.
    for col_letter, header in zip('ABCD', ("学校", "位置", "内容", "链接")):
        cell = worksheet[f'{col_letter}1']
        cell.value = header
        cell.font = title_font
        cell.alignment = title_alignment

    row_num = 2
    for item in data:
        entries = item['content']
        if not entries:
            # Nothing to write; also avoids an inverted merge range below.
            continue
        title = item['title']
        position = item['position']
        start_row = row_num
        end_row = start_row + len(entries) - 1

        for content_item in entries:
            content_src = content_item['src']

            worksheet[f'A{row_num}'] = title
            worksheet[f'B{row_num}'] = position
            worksheet[f'C{row_num}'] = content_item['title']
            worksheet[f'D{row_num}'] = content_src

            for col_letter in ('C', 'D'):
                worksheet[f'{col_letter}{row_num}'].border = border

            for col_letter in ('A', 'B'):
                cell = worksheet[f'{col_letter}{row_num}']
                cell.border = border
                cell.alignment = title_alignment
                cell.font = title_font

            link_cell = worksheet[f'D{row_num}']
            link_cell.font = url_font
            # Plain-text entries have an empty src; attaching a hyperlink with
            # an empty target would produce a dead link in the workbook.
            if content_src:
                link_cell.hyperlink = content_src

            row_num += 1

        # One merged school/position cell spanning all rows of this item.
        worksheet.merge_cells(f'A{start_row}:A{end_row}')
        worksheet.merge_cells(f'B{start_row}:B{end_row}')
        row_num += 1  # blank separator row between items

    # Auto-fit every column to its longest value. Empty and merged cells hold
    # None and are ignored explicitly rather than via a bare except.
    for column in worksheet.columns:
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        column_letter = get_column_letter(column[0].column)
        # Excel rejects column widths above 255.
        worksheet.column_dimensions[column_letter].width = min(max_length + 2, 255)
    # Fixed widths override the auto-fit result for the first three columns.
    worksheet.column_dimensions['A'].width = 32
    worksheet.column_dimensions['B'].width = 9
    worksheet.column_dimensions['C'].width = 60

    output_excel_path = f'./excelInfo/{location}.xlsx'
    # save() does not create missing directories.
    os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)
    workbook.save(output_excel_path)
    print(f"数据已成功写入 {output_excel_path}")

def creat_json(data, location):
    """Dump ``data`` as pretty-printed UTF-8 JSON to ``./jsonInfo/<location>.json``.

    Args:
        data: Any JSON-serialisable object.
        location: Name used for the output file (without extension).

    Side effects:
        Creates ``./jsonInfo`` if needed, writes the file and prints a
        confirmation message.
    """
    # Function-scope import so the block does not depend on a file-level import.
    import os

    output_file_path = f'./jsonInfo/{location}.json'
    # open(..., 'w') does not create missing directories.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
    print(f"数据已成功写入 {output_file_path}")

# Script entry point: crawl one regional index page and export the results.
if __name__ == '__main__':
    main_url = "http://www.xiaoyuanzhaopin.net/fjsyxx/"
    location = '福建'

    # A timeout prevents an unresponsive server from hanging the script.
    main_response = requests.get(main_url, timeout=10)
    # Fail fast: without the index page there is nothing to crawl.
    main_response.raise_for_status()
    main_soup = BeautifulSoup(main_response.text, 'html.parser')
    main_content = main_soup.find('div', class_='entry-content')
    if main_content is None:
        raise SystemExit(f"页面结构不符合预期: {main_url}")

    main_links = []
    for table in main_content.find_all('table'):
        for td in table.find_all('td'):
            a_elements = td.find_all('a')
            # Only cells holding exactly two links are used: the first link's
            # text is the position label, the second link is the detail page.
            if len(a_elements) != 2:
                continue
            href = a_elements[1].get('href')
            if not href:
                continue
            main_links.append({
                'position': a_elements[0].text.strip(),
                'text': a_elements[1].text.strip(),
                'href': href,
            })

    for link in main_links:
        time.sleep(0.7)  # pause between requests
        result = extract_info(link['href'], link['position'])
        if result:
            all_results.append(result)

    creat_json(all_results, location)
    creat_excel(all_results, location)
    print(len(all_results))
    print(errschool)

从代码中学习

发现通过代码实例来学习的效率确实要高很多，一下子就学会beautifulSoup和openpyxl的基础用法了。

beautifulSoup

获取页面的beautifulSoup对象


import requests
from bs4 import BeautifulSoup

response = requests.get(url)
    if response.status_code != 200:
        print(f"无法访问网页: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

根据需要获取标签

soup.find('标签名', class_='类名')	返回第一个符合条件的元素（class_ 参数可选）；返回的元素是 Tag 对象，可以继续对它调用 find/find_all；找不到时返回 None
soup.find_all('标签名', class_='类名')	获取所有符合条件的元素，返回一个列表；列表中的每一个元素同样是 Tag 对象

获取标签的属性值

soup.text.strip()	获取标签内部的文本
soup['src']	获取标签的src属性值
soup['href']	获取标签的href属性值

爬取页面需要的东西好像就这些也就够了。

openpyxl

创建一个新的Excel对象

workbook = openpyxl.Workbook()

创建一页新的sheet

worksheet = workbook.active
worksheet.title = "sheet名"

在特定的位置写入信息

worksheet['A1'] = "学校"
worksheet['B1'] = "内容"
worksheet['C1'] = "链接"

合并需要合并的单元格

# merge_cells("左上角单元格:右下角单元格")
worksheet.merge_cells(f'A{row_num - len(item["content"])}:A{end_row}')

设置样式

# 定义样式
    title_font = Font(size=12, bold=True)
    title_alignment = Alignment(vertical='center')
    url_font = Font(color="0000FF", underline="single")
    border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'),bottom=Side(style='thin'))

#设置样式
		cell = worksheet[f'A{row_num}']
    cell.border = border
    cell.alignment = title_alignment
    cell.font = title_font
# 设置链接样式
    worksheet[f'C{row_num}'].font = url_font
    worksheet[f'C{row_num}'].hyperlink = content_src

输出

# 指定输出的Excel文件路径
  output_excel_path = 'output.xlsx'

  # 保存Excel文件
  workbook.save(output_excel_path)

  print(f"数据已成功写入 {output_excel_path}")

只要掌握这些函数，把爬取的内容放到excel里就也没什么问题了

发现通过ai生成基础代码来学习的效率真的很高，而且学习的过程也有趣许多，很容易看到学到的每一点东西的作用，不过似乎有点功利了。但先功利着吧，有意思毕竟也很重要呀！😉