123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- import requests
- import scrapy
- import re
- import sqlite3
- import time
# Connect to the database
# `con` and `cur` are module-level and shared by every function in this file.
con = sqlite3.connect("轮回乐园.db")
cur = con.cursor()
# Merge data
def dowload():
    """Append every row of the ``book`` table to ``data.txt``.

    For each row, prints column 0, then appends column 1 followed by
    column 3 and a newline to ``data.txt``, pausing two seconds after
    each row.

    Uses the module-level cursor ``cur``.
    """
    cur.execute("SELECT * FROM book")
    # Iterate the cursor directly so every row is visited exactly once.
    # The previous fetch-before-process loop skipped the first row and
    # indexed ``None`` after the last one.
    for row in cur:
        print(row[0])
        line = row[1] + row[3] + '\n'
        # NOTE(review): the file is opened with the platform default
        # encoding; confirm that is what downstream readers expect.
        with open('data.txt', 'a') as f:
            f.write(line)
        time.sleep(2)
- # dowload()
# Check whether a table exists
def is_table(table_name):
    """Return True when ``sqlite_master`` has NO entry named *table_name*.

    NOTE: despite the name, the result is True when the table is missing
    and False when it exists; callers use it as a "needs creating" check.

    Uses the module-level cursor ``cur``.
    """
    # Bind the name as a parameter instead of splicing it into the SQL, so
    # names containing quotes cannot break or alter the statement.
    res = cur.execute(
        "SELECT name FROM sqlite_master WHERE name=?", (table_name,)
    )
    return res.fetchone() is None
# Create the article topic table
# IF NOT EXISTS lets SQLite perform the existence check and the creation in
# a single statement, so re-running the script is a no-op here.
cur.execute("CREATE TABLE IF NOT EXISTS new_book(url, title, pt_next)")
# Fetch data in bulk
def get_more_data(title):
    """Return all rows of ``book`` whose ``title`` column equals *title*.

    Uses the module-level cursor ``cur``. The result is the list returned
    by ``fetchall()`` (empty when nothing matches).
    """
    # Bind the title as a parameter: titles containing a single quote would
    # otherwise produce invalid SQL (or alter the query).
    res = cur.execute("SELECT * FROM book WHERE title=?", (title,))
    return res.fetchall()
# Connect to the database
# The connection and cursor opened near the top of this file are reused from
# here on. Opening a second connection to the same database file only rebound
# `con`/`cur` and left the first connection unclosed.
# Record data that has already been processed
def inset_new_book(url, title, pt_next):
    """Insert one ``(url, title, pt_next)`` row into ``new_book`` and commit.

    Uses the module-level cursor ``cur`` and connection ``con``.
    """
    record = (url, title, pt_next)
    cur.execute("INSERT INTO new_book VALUES(?, ?, ?)", record)
    # Commit immediately so the insert is persisted before the next step.
    con.commit()
# Check whether the current title is already stored in the database
def is_title_in_table(title):
    """Return True when ``new_book`` has NO row with this *title*.

    NOTE: despite the name, the result is True when the title is absent
    and False when it is present.

    Uses the module-level cursor ``cur``.
    """
    # Bind the title as a parameter: titles containing a single quote would
    # otherwise produce invalid SQL (or alter the query).
    res = cur.execute("SELECT title FROM new_book WHERE title=?", (title,))
    return res.fetchone() is None
# Filter data
def clear_data():
    """Write each not-yet-recorded title and its text to ``data.txt``.

    Reads every row of ``book``. For each row whose title
    ``is_title_in_table`` reports as absent from ``new_book``, fetches all
    ``book`` rows with that title, appends the title followed by column 3
    of each of those rows (one per line) to ``data.txt``, and records the
    row in ``new_book`` so later rows with the same title are skipped.
    """
    cur.execute("SELECT * FROM book")
    # Materialise the rows first: the helpers called below run their own
    # queries on the same shared cursor.
    for record in cur.fetchall():
        url, title, pt_next = record[0], record[1], record[2]
        print(title)
        # is_title_in_table() is True when the title is NOT yet in new_book.
        if not is_title_in_table(title):
            continue
        body = ''.join(chapter[3] + '\n' for chapter in get_more_data(title))
        with open('data.txt', 'a') as out:
            out.write(title + '\n' + body)
        inset_new_book(url=url, title=title, pt_next=pt_next)
# Run the filtering pass as soon as the module is executed.
clear_data()
|