"""
signed

QiShunwang

“诚信为本、客户至上”

爬取新闻1 (news crawler #1)

2021/6/3 15:17:15   来源:
"""
import sqlite3

from urllib import request
from bs4 import BeautifulSoup
from collections import deque
import re
import jieba
# Suppress jieba's start-up "Building prefix dict..." messages (log INFO and above only).
jieba.setLogLevel(jieba.logging.INFO)

# Crawl roots: url1 is the BFS seed; url2 is the base prefix for bare relative
# links; url0 appears only in the domain-filter regex inside the crawl loop.
url0 = "https://www.zut.edu.cn"
url1 = "https://www.fjut.edu.cn"
url2 = 'https://www.fjut.edu.cn/'
unvisited = deque()  # FIFO queue of links waiting to be crawled
visited = set()      # links already dequeued (treated as crawled)
unvisited.append(url1)

# Create the index schema up front.
#   doc:  crawl sequence number (id) -> page URL
#   word: term -> space-separated list of doc ids (the inverted index)
con = sqlite3.connect('1.db')
c = con.cursor()
# BUGFIX: use IF NOT EXISTS so re-running the script does not raise
# sqlite3.OperationalError ("table doc already exists").
c.execute('create table if not exists doc(id int primary key, link text)')
c.execute('create table if not exists word(term varchar(25) primary key, list text)')
con.commit()
con.close()
cnt = 0  # number of URLs dequeued so far; doubles as the doc id
while unvisited:
    # Dequeue the next URL (BFS order) and mark it visited immediately so it
    # can never be re-queued, even if the fetch below fails.
    url = unvisited.popleft()
    visited.add(url)
    cnt += 1

    print("开始爬取第", cnt, "个链接:", url)
    try:
        response = request.urlopen(url)
        content = response.read().decode('UTF-8')
    except Exception:
        # BUGFIX: was a bare `except:` which also swallowed KeyboardInterrupt/
        # SystemExit. Fetch/decode failures just skip this URL (best effort).
        continue
    soup = BeautifulSoup(content, 'lxml')

    # Harvest candidate links from every anchor on the page.
    for a in soup.find_all('a'):
        if 'href' not in a.attrs:
            # BUGFIX: anchors without href used to fall through and re-process
            # the previous anchor's x (or raise NameError on the first anchor).
            continue
        x = a.attrs['href']

        # Drop absolute URLs that are not on the allowed domain.
        if re.match(r'http.+', x):
            if not re.match(r'https\:\/\/www\.zut\.edu\.cn\/.+', x):
                continue
        # Turn relative "...htm" links into absolute URLs.
        if re.match(r'^\/.*?(htm)$', x):    # starts with /, ends with htm
            x = url1 + x
        elif re.match(r'.*?htm$', x):       # bare relative path ending in htm
            x = url2 + x

        # Enqueue only links we have neither crawled nor already queued.
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    # Follow the pagination ("next page") link, if present.
    a = soup.find('a', {'class': "next"})
    if a is not None and 'href' in a.attrs:  # BUGFIX: guard missing href (KeyError)
        x = a.attrs['href']
        if re.match(r'^\/.*?(htm)$', x):
            x = url1 + x
        elif re.match(r'.*?htm$', x):
            x = url2 + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    # --- Extract title / author / article text; any of them may be missing. ---
    # BUGFIX: the original branch ladder only checked `title is None` in the
    # first case, so a page with no <title> but with an author or article
    # crashed with AttributeError on `title.text`. Normalizing each field
    # independently handles every combination.
    title_tag = soup.title
    title = '' if title_tag is None else ''.join(title_tag.text.split())

    author_sel = soup.select('.biaoti-info > span > span > span > span')
    author = author_sel[0].get_text("", strip=True) if author_sel else ""

    article_sel = soup.select('#maintext2 > div > div > div > div')
    if not article_sel:
        # Fallback selector for the alternate page layout.
        article_sel = soup.select('[frag = "面板2"] > div > div > div > div > div')
    article = ("".join(article_sel[0].get_text("", strip=True).split())
               if article_sel else "")

    # Diagnostic messages mirroring the original branch ladder.
    if not title and not author and not article:
        print('无内容的页面')
    elif not author and not article:
        print('只有标题')   # probably an index/listing page
    elif not article:
        print('有标题有作者,缺失内容')
    elif not author:
        print('有标题有内容,缺失作者')
    print("title: ", title)

    # --- Chinese word segmentation over all extracted text. ---
    seglist = list(jieba.cut_for_search(title))
    seglist += jieba.cut_for_search(article)
    seglist += jieba.cut_for_search(author)

    # --- Persist: record the doc, then append cnt to each term's posting list. ---
    con = sqlite3.connect("1.db")
    try:
        c = con.cursor()
        c.execute('insert into doc values(?, ?)', (cnt, url))
        for word in seglist:
            row = c.execute('select list from word where term = ?', (word,)).fetchone()
            if row is None:
                # First occurrence of this term anywhere: start its posting list.
                c.execute('insert into word values(?, ?)', (word, str(cnt)))
            else:
                # Append this doc id (duplicates within one doc are kept,
                # matching the original behavior).
                c.execute('update word set list = ? where term = ?',
                          (row[0] + ' ' + str(cnt), word))
        con.commit()
    finally:
        # BUGFIX: close the connection even if an insert fails mid-loop.
        con.close()
print('词表建立完毕=======================================================')