import re
import sqlite3
import time

import requests
import scrapy


def check_string(re_exp, str):
    """Return True if the regular expression matches anywhere in the string.

    Args:
        re_exp: Regular-expression pattern passed to ``re.search``.
        str: Text to search. The name shadows the builtin ``str`` inside this
            function; it is kept so existing keyword callers keep working.

    Returns:
        bool: True when ``re.search`` finds a match, otherwise False.
    """
    return re.search(re_exp, str) is not None


# Connect to the database; the connection and cursor are shared module-wide.
con = sqlite3.connect("轮回乐园.db")
cur = con.cursor()


def is_table(table_name):
    """Check whether ``table_name`` is still missing from the database.

    NOTE: despite its name, this returns True when NO entry with that name
    exists in ``sqlite_master`` and False when one exists. The callers below
    depend on this inverted meaning to decide whether to create a table.

    Args:
        table_name: Name to look up in ``sqlite_master``.

    Returns:
        bool: True if the name is absent, False if it is present.
    """
    # Bind the name as a parameter instead of concatenating it into the SQL,
    # so names containing quotes cannot break or alter the statement.
    res = cur.execute(
        "SELECT name FROM sqlite_master WHERE name = ?", (table_name,)
    )
    return res.fetchone() is None


# Create the table holding the scraped chapters (only if it does not exist).
if is_table('book'):
    cur.execute("CREATE TABLE book(url, title, pt_next, text)")


def is_url_in_table(url):
    """Check whether ``url`` is still missing from the ``book`` table.

    NOTE: despite its name, this returns True when the URL has NOT been
    stored yet and False when a row with that URL already exists.

    Args:
        url: Page URL to look up in the ``url`` column of ``book``.

    Returns:
        bool: True if no row has this URL, False otherwise.
    """
    # Parameter binding keeps URLs containing quotes from breaking the query.
    res = cur.execute("SELECT url FROM book WHERE url = ?", (url,))
    return res.fetchone() is None


# Create the temporary progress table (only if it does not exist).
if is_table('temp'):
    cur.execute("CREATE TABLE temp(url, title, pt_next)")


def inset_book(url, title, pt_next, text):
    """Insert one row into the ``book`` table and commit immediately.

    Args:
        url: Value for the ``url`` column.
        title: Value for the ``title`` column.
        pt_next: Value for the ``pt_next`` column.
        text: Value for the ``text`` column.
    """
    cur.execute(
        "INSERT INTO book VALUES(?, ?, ?, ?)", (url, title, pt_next, text)
    )
    # The INSERT is only persisted once the transaction is committed.
    con.commit()


def inset_temp(url, title, pt_next):
    """Insert one row into the ``temp`` table and commit immediately.

    Args:
        url: Value for the ``url`` column.
        title: Value for the ``title`` column.
        pt_next: Value for the ``pt_next`` column.
    """
    cur.execute("INSERT INTO temp VALUES(?, ?, ?)", (url, title, pt_next))
    # The INSERT is only persisted once the transaction is committed.
    con.commit()
def get_text(page_url):
    """Download a chapter page, store it, and keep following "next" links.

    Starting at ``page_url``, each page is fetched and parsed for its title
    (``span.title``), its body text (``div#chaptercontent``) and the href of
    the ``a#pt_next`` link. Pages whose URL is not yet in the ``book`` table
    are inserted via ``inset_book``. The crawl continues with the linked page
    until the link is missing, empty, or equal to ``'/txt/31063/'``.

    The chain is walked with a loop rather than one recursive call per page,
    so a long run of chapters cannot exceed Python's recursion limit.

    Args:
        page_url: Absolute URL of the first chapter page to fetch.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    while True:
        # A timeout keeps an unresponsive server from blocking forever.
        res = requests.get(url=page_url, timeout=30)
        res.raise_for_status()

        sel = scrapy.Selector(text=res.text)
        title = sel.css('span[class="title"]::text').get()
        pt_next = sel.css('a[id="pt_next"]').xpath('@href').get()

        # Collect the text nodes, dropping bare newlines and the site
        # watermark, and terminate every kept fragment with a newline.
        words = sel.css('div[id="chaptercontent"]::text').extract()
        text = ''.join(
            word + '\n'
            for word in words
            if word not in ('\n', 'm.2188c.com')
        )

        # Store the page. is_url_in_table() returns True when the URL is
        # NOT stored yet, so this skips pages already in the database.
        if is_url_in_table(page_url):
            inset_book(url=page_url, title=title, pt_next=pt_next, text=text)

        # Stop when there is no usable next link. '/txt/31063/' is presumably
        # the book's index page, linked from the last chapter - TODO confirm.
        if not pt_next or pt_next == '/txt/31063/':
            break

        print(title, pt_next)
        page_url = 'https://m.2188c.com' + pt_next


# 11649
# Commented-out alternative start URLs:
# get_text('https://m.2188c.com/txt/31063/93347948.html')
# get_text('https://m.2188c.com/txt/31063/85901149.html')
# get_text('https://m.2188c.com/txt/31063/82277794.html')
get_text('https://m.2188c.com/txt/31063/88316053.html')