# python / python-reptile-novels
import re
import sqlite3
import time
from urllib.parse import urljoin

import requests
import scrapy


# Purpose: check whether the string `str` matches the regular expression `re_exp`
# re_exp: the regular expression
# str: the string to check
def check_string(re_exp, str):
    """Return True if ``re_exp`` matches anywhere in ``str``, else False.

    The pattern is applied with ``re.search``, so it does not need to match
    from the start of the string.
    """
    return re.search(re_exp, str) is not None


# Connect to the SQLite database file "tutorial.db" and create the shared
# cursor that the helper functions in this module execute their queries on.
con = sqlite3.connect("tutorial.db")
cur = con.cursor()


# Check whether a table still needs to be created
def is_table(table_name):
    """Report whether ``table_name`` is absent from the database schema.

    NOTE: despite its name, this returns True when NO entry with that name
    exists in ``sqlite_master`` and False when one does. Callers use the
    result as "the table must be created", so the polarity is kept as is.

    Args:
        table_name: Name to look up in ``sqlite_master``.

    Returns:
        True if the name is not found, False if it is.
    """
    # Bind the name as a parameter instead of splicing it into the SQL text,
    # so quotes in the name can neither break nor alter the query.
    res = cur.execute(
        "SELECT name FROM sqlite_master WHERE name = ?", (table_name,)
    )
    return res.fetchone() is None


# Create the table that stores the scraped chapters.
# is_table() returns True when the name is NOT found in sqlite_master, so the
# CREATE TABLE below only runs when the "book" table does not exist yet.
if is_table('book'):
    cur.execute("CREATE TABLE book(url, title, pt_next, text)")


# Check whether a URL still needs to be stored in the book table
def is_url_in_table(url):
    """Report whether ``url`` is absent from the ``book`` table.

    NOTE: despite its name, this returns True when the URL has NOT been
    stored yet and False when a row with that URL exists. Callers use the
    result as "this page still has to be inserted", so the polarity is kept.

    Args:
        url: Page URL to look up in the ``url`` column of ``book``.

    Returns:
        True if no row has this URL, False otherwise.
    """
    # The URL originates from scraped pages; bind it as a parameter rather
    # than concatenating it into the SQL so a quote character in the URL
    # cannot break the statement or inject SQL.
    res = cur.execute("SELECT url FROM book WHERE url = ?", (url,))
    return res.fetchone() is None


# Create the temporary progress table.
# is_table() returns True when the name is NOT found in sqlite_master, so the
# CREATE TABLE below only runs when the "temp" table does not exist yet.
if is_table('temp'):
    cur.execute("CREATE TABLE temp(url, title, pt_next)")


# Write one row into the book table
def inset_book(url, title, pt_next, text):
    """Insert a single chapter row into ``book`` and commit it.

    Args:
        url: URL of the chapter page.
        title: Chapter title.
        pt_next: Link to the next page as found on the chapter page.
        text: Chapter body text.
    """
    insert_sql = "INSERT INTO book VALUES(?, ?, ?, ?)"
    row = (url, title, pt_next, text)
    cur.executemany(insert_sql, [row])
    # Commit immediately so the inserted row is persisted.
    con.commit()


# cur.executemany("INSERT INTO movie book(?, ?, ?, ?)", data)

def inset_temp(url, title, pt_next):
    """Insert a single progress row into ``temp`` and commit it.

    Args:
        url: URL of the page.
        title: Title found on the page.
        pt_next: Link to the next page as found on the page.
    """
    insert_sql = "INSERT INTO temp VALUES(?, ?, ?)"
    row = (url, title, pt_next)
    cur.executemany(insert_sql, [row])
    # Commit immediately so the inserted row is persisted.
    con.commit()


def get_text(page_url):
    """Crawl chapter pages one after another, starting at ``page_url``.

    For every page this downloads the HTML, extracts the chapter title, the
    "next page" link and the chapter text, and inserts the chapter into the
    ``book`` table when its URL is not stored yet. It then follows the next
    link. Crawling stops when a page has no next link, when the next link is
    ``/txt/31063/`` (presumably the book's index page, i.e. the end of the
    book -- confirm), or when the next URL was already visited in this call.

    Args:
        page_url: Absolute URL of the first chapter page to fetch.

    Returns:
        None. Chapters are written to the database as a side effect and the
        title / next link of each followed page is printed.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the request times out.
    """
    visited = set()
    current_url = page_url

    # Iterate instead of recursing: one stack frame per chapter would exceed
    # Python's recursion limit on books with many chapters. The visited set
    # keeps a cycle of "next" links from looping forever.
    while current_url not in visited:
        visited.add(current_url)

        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url=current_url, timeout=30)
        sel = scrapy.Selector(text=res.text)
        title = sel.css('span[class="title"]::text').get()
        pt_next = sel.css('a[id="pt_next"]').xpath('@href').get()

        # Collect the chapter text, dropping bare newlines and the site's
        # "m.2188c.com" watermark fragments; every kept fragment ends a line.
        words = sel.css('div[id="chaptercontent"]::text').extract()
        text = ''.join(
            word + '\n'
            for word in words
            if word not in ('\n', 'm.2188c.com')
        )

        # is_url_in_table() returns True when the URL is NOT stored yet.
        if is_url_in_table(current_url):
            inset_book(url=current_url, title=title, pt_next=pt_next, text=text)

        # pt_next is None when the page has no next link (e.g. an error
        # page); treat that like an empty link instead of crashing on the
        # string concatenation.
        if not pt_next or pt_next == '/txt/31063/':
            break

        print(title, pt_next)
        # Resolve the link against the current page so root-relative,
        # relative and absolute hrefs all produce a valid URL.
        current_url = urljoin(current_url, pt_next)

# 11649
# NOTE(review): the meaning of the number above is not evident from this file
# -- confirm or remove.

# Alternative starting chapters, kept commented out:
# get_text('https://m.2188c.com/txt/31063/93347948.html');
# get_text('https://m.2188c.com/txt/31063/85901149.html');
# get_text('https://m.2188c.com/txt/31063/82277794.html');
# Script entry: start crawling from this chapter page. This runs whenever the
# module is executed or imported.
get_text('https://m.2188c.com/txt/31063/88316053.html');