# Source page header (QiShunwang — “诚信为本、客户至上”)
# python爬取新闻 (Python news crawler/search), 2021/6/3 14:27:57
import re
import urllib
from urllib import request
from collections import deque
from bs4 import BeautifulSoup
import lxml
import sqlite3
import jieba
import math

# Open the index database built by the crawler below.
conn=sqlite3.connect("viewsdu.db")

c=conn.cursor()
# Total number of indexed documents; the +1 keeps idf = log(N/df)
# strictly positive even for terms that appear in every document.
c.execute('select count(*) from doc')
N=1+c.fetchall()[0][0]
# Read the search query from the user.
target=input("请输入搜索词")
seggen=jieba.cut_for_search(target)
# Segment the query into search terms.
# score maps doc id -> accumulated TF-IDF relevance score.
score={}

# Accumulate a TF-IDF score for every document containing a query term.
for word in seggen:
    print('得到查询词: ', word)  # fixed typo: 得道 -> 得到
    # Posting list for this term: space-separated doc ids, repeated once
    # per occurrence of the term in that document (see the indexer below).
    c.execute('select list from word where term=?', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = [int(x) for x in result[0][0].split(' ')]
        df = len(set(doclist))   # document frequency: distinct docs
        idf = math.log(N / df)   # N is doc count + 1, so idf > 0
        print('idf: ', idf)
        # Term frequency per document, derived from the repeated ids.
        tf = {}
        for num in doclist:
            tf[num] = tf.get(num, 0) + 1
        # score[doc] += tf * idf for this query term.
        for num, freq in tf.items():
            score[num] = score.get(num, 0) + freq * idf

# Rank documents by descending relevance score.
sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

print('得分列表', sortedlist)

# Print the top-ranked results: fetch each document's URL and show its title.
cnt = 0  # number of results shown so far
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=?', (num,))
    url = c.fetchall()[0][0]
    print(url, '得分: ', docscore)
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except Exception:
        # Network or decoding failure: skip this result, keep going.
        print("网页读取出错")
        continue

    soup = BeautifulSoup(content, 'lxml')
    title = soup.title
    if title is None:
        print('No title.')
    else:
        print(title.text)
    # Show at most 10 results (original broke only after the 11th).
    if cnt >= 10:
        break
if cnt == 0:
    print("无搜索结果")

 

# search_engine_build-2.py(爬取并保存)
import sys
from collections import deque
import urllib
from urllib import request
import re
from bs4 import BeautifulSoup
import lxml
import sqlite3
import jieba

url = 'https://www.fjut.edu.cn/561/list.htm'  # 'http://www.zut.edu.cn'#入口 (crawl entry point)

unvisited = deque()  # BFS frontier of links still to crawl
visited = set()      # links already popped from the frontier
unvisited.append(url)
conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
# Recreate both tables on every run so stale rows from a previous crawl
# cannot collide with the fresh doc ids assigned below.  (The original
# crashed on re-runs: 'create table' fails once the tables exist and the
# 'drop table' lines were commented out.)
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key,link text)')
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key,list text)')
conn.commit()
conn.close()

print('***************开始!*****************************')
cnt = 0  # doc id counter: each dequeued link consumes the next id
print('开始。。。。。 ')
while unvisited:
    url = unvisited.popleft()
    visited.add(url)
    cnt += 1
    print('开始抓取第', cnt, '个链接:', url)

    # Fetch the page; skip it on any network or decoding failure.
    # NOTE(review): cnt was already incremented, so failed fetches leave
    # gaps in the doc-id sequence (same behavior as the original).
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except Exception:
        continue

    # Parse the page; the link/content selectors are site-specific.
    soup = BeautifulSoup(content, 'lxml')

    # Queue every same-site news link (<a target="_blank" href="/...">).
    for a in soup.find_all('a', {'target': "_blank"}):
        x = a.attrs['href']
        if not re.match(r'^/', x):
            continue  # keep the crawl inside the site
        x = 'https://www.fjut.edu.cn' + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    # Queue the pagination "next page" link, if present.
    a = soup.find('a', {'class': "next"})
    if a is not None:
        # NOTE(review): this prefix ends with '/' unlike the one above —
        # presumably the next-page href is relative; verify on the site.
        x = 'https://www.fjut.edu.cn/' + a.attrs['href']
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    title = soup.title

    # Extract the article text: prefer tags starting with "p" inside the
    # content div, fall back to tags starting with "div", else empty.
    article = soup.find('div', class_="Article_Content")
    if article:
        parts = article.find_all(re.compile("^p"))
        if not parts:
            parts = article.find_all(re.compile("^div"))
        if parts:
            pieces = []
            for p in parts:
                p_str = p.get_text("", strip=True)
                pieces.append(''.join(p_str.split()))  # strip all whitespace
            article = ''.join(pieces)
            print(article)
        else:
            article = ''
    else:
        article = ''

    if title is None:
        print('无内容的页面。')
        continue
    title = ''.join(title.text.split())

    print('网页标题:', title)

    # Segment title + body with jieba for the inverted index.
    seglist = list(jieba.cut_for_search(title))
    seglist += list(jieba.cut_for_search(article))

    # Persist this document and update the inverted index.
    conn = sqlite3.connect("viewsdu.db")
    c = conn.cursor()
    c.execute('insert into doc values(?,?)', (cnt, url))

    # word -> space-separated doc ids, one id per occurrence; the search
    # script derives term frequency from the repetitions, so duplicates
    # must be kept.
    for word in seglist:
        c.execute('select list from word where term=?', (word,))
        result = c.fetchall()
        if len(result) == 0:
            # First sighting of this term.
            c.execute('insert into word values(?,?)', (word, str(cnt)))
        else:
            # Append this doc id to the existing posting list.
            docliststr = result[0][0] + ' ' + str(cnt)
            c.execute('update word set list=? where term=?', (docliststr, word))

    conn.commit()
    conn.close()
print('词表建立完毕')