123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- import requests
- import scrapy
- import re
- import sqlite3
- import time
def check_string(re_exp, str):
    """Tell whether ``str`` contains a match for the regular expression.

    Args:
        re_exp: Regular expression pattern to search for.
        str: String to be checked.

    Returns:
        True if ``re.search`` finds a match anywhere in ``str``,
        False otherwise.
    """
    # re.search yields a match object on success and None on failure.
    return re.search(re_exp, str) is not None
# Connect to the SQLite database file "tutorial.db" and keep a single
# module-wide cursor; the helper functions in this file use both objects.
con = sqlite3.connect("tutorial.db")
cur = con.cursor()
def is_table(table_name):
    """Report whether ``table_name`` is ABSENT from the database.

    NOTE: despite the name, the result is True when nothing called
    ``table_name`` is listed in ``sqlite_master`` and False when an entry
    exists. Callers in this file rely on that ("if is_table(x): create x"),
    so the semantics are kept as they are.

    Args:
        table_name: Name to look up in ``sqlite_master``.

    Returns:
        True if no row with that name exists, False otherwise.
    """
    # Bind the name as a parameter instead of concatenating it into the
    # SQL text, so names containing quotes cannot break or alter the query.
    res = cur.execute(
        "SELECT name FROM sqlite_master WHERE name=?", (table_name,)
    )
    return res.fetchone() is None
# Create the main chapter table when it does not exist yet.
# (is_table() returns True when the table is missing.)
if is_table('book'):
    cur.execute("CREATE TABLE book(url, title, pt_next, text)")
def is_url_in_table(url):
    """Report whether ``url`` is ABSENT from the ``book`` table.

    NOTE: despite the name, the result is True when no ``book`` row has
    this URL and False when one exists. ``get_text`` relies on that to
    decide whether a chapter still has to be inserted, so the semantics
    are kept as they are.

    Args:
        url: Chapter URL to look up.

    Returns:
        True if the URL is not stored yet, False otherwise.
    """
    # URLs are derived from scraped links, so bind the value as a
    # parameter rather than splicing it into the SQL text.
    res = cur.execute("SELECT url FROM book WHERE url=?", (url,))
    return res.fetchone() is None
# Create the temporary progress table when it does not exist yet.
# (is_table() returns True when the table is missing.)
if is_table('temp'):
    cur.execute("CREATE TABLE temp(url, title, pt_next)")
def inset_book(url, title, pt_next, text):
    """Insert one chapter row into the ``book`` table and commit it.

    Args:
        url: URL of the chapter page.
        title: Chapter title.
        pt_next: Link to the next chapter as found on the page.
        text: Chapter text.
    """
    row = (url, title, pt_next, text)
    cur.execute("INSERT INTO book VALUES(?, ?, ?, ?)", row)
    # Commit right away so the inserted row is persisted.
    con.commit()
def inset_temp(url, title, pt_next):
    """Insert one progress row into the ``temp`` table and commit it.

    Args:
        url: URL of the chapter page.
        title: Chapter title.
        pt_next: Link to the next chapter as found on the page.
    """
    row = (url, title, pt_next)
    cur.execute("INSERT INTO temp VALUES(?, ?, ?)", row)
    # Commit right away so the inserted row is persisted.
    con.commit()
def get_text(page_url, timeout=30):
    """Crawl a chain of chapter pages starting at ``page_url``.

    For every page this fetches the HTML, extracts the title
    (``span[class="title"]``), the next-page link (``a[id="pt_next"]``)
    and the text nodes of ``div[id="chaptercontent"]``, stores the chapter
    in the ``book`` table unless its URL is already there, and then
    follows the next-page link.

    Crawling stops when the page has no next link, when the link equals
    ``/txt/31063/`` (presumably the book's index page - confirm), or when
    the next URL was already visited during this call.

    Args:
        page_url: Absolute URL of the first chapter page to fetch.
        timeout: Seconds to wait for each HTTP response; ``requests``
            raises a timeout exception when it is exceeded.

    Returns:
        None. Chapters are written to the database and progress is
        printed to stdout.
    """
    # Iterate instead of recursing once per chapter: a long chain of
    # chapters would otherwise exceed Python's recursion limit.
    visited = set()
    while page_url not in visited:
        visited.add(page_url)

        res = requests.get(url=page_url, timeout=timeout)
        sel = scrapy.Selector(text=res.text)
        title = sel.css('span[class="title"]::text').get()
        pt_next = sel.css('a[id="pt_next"]').xpath('@href').get()

        # Collect the chapter text, skipping bare newlines and the
        # site-name text node, one fragment per line.
        words = sel.css('div[id="chaptercontent"]::text').extract()
        text = ''.join(
            word + '\n'
            for word in words
            if word != '\n' and word != 'm.2188c.com'
        )

        # is_url_in_table() returns True when the URL is NOT stored yet.
        if is_url_in_table(page_url):
            inset_book(url=page_url, title=title, pt_next=pt_next, text=text)

        # A missing link (None or '') or the stop path ends the crawl.
        if not pt_next or pt_next == '/txt/31063/':
            break

        print(title, pt_next)
        page_url = 'https://m.2188c.com' + pt_next
# 11649  (NOTE(review): bare number kept from the original; its meaning is
#         not evident from this file)
# Other starting URLs that were left commented out:
# get_text('https://m.2188c.com/txt/31063/93347948.html')
# get_text('https://m.2188c.com/txt/31063/85901149.html')
# get_text('https://m.2188c.com/txt/31063/82277794.html')
# Start crawling from this chapter page.
get_text('https://m.2188c.com/txt/31063/88316053.html')
|