test4.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. import requests
  2. import scrapy
  3. import re
  4. import sqlite3
  5. import time
  6. # 功能:检查字符串str是否符合正则表达式re_exp
  7. # re_exp:正则表达式
  8. # str:待检查的字符串
  9. def check_string(re_exp, str):
  10. res = re.search(re_exp, str)
  11. if res:
  12. return True
  13. else:
  14. return False
  15. # 链接数据库
  16. con = sqlite3.connect("tutorial.db")
  17. cur = con.cursor()
  18. # 判读数据表是否存在
  19. def is_table(table_name):
  20. res = cur.execute("SELECT name FROM sqlite_master WHERE name='" + table_name + "'")
  21. return res.fetchone() is None
  22. # 创建文章主题表
  23. if is_table('book'):
  24. cur.execute("CREATE TABLE book(url, title, pt_next, text)")
  25. # 判断url是否存在数据库中
  26. def is_url_in_table(url):
  27. res = cur.execute("SELECT url FROM book WHERE url='" + url + "'")
  28. return res.fetchone() is None
  29. # 创建临时进度表
  30. if is_table('temp'):
  31. cur.execute("CREATE TABLE temp(url, title, pt_next)")
  32. # 向 book 表写入数据
  33. def inset_book(url, title, pt_next, text):
  34. # print(url, title, pt_next, text)
  35. cur.executemany("INSERT INTO book VALUES(?, ?, ?, ?)", [(url, title, pt_next, text)])
  36. # ("Monty Python Live at the Hollywood Bowl", "Bowl", "Bowl", "Bowl"))
  37. con.commit() # Remember to commit the transaction after executing INSERT.
  38. # cur.executemany("INSERT INTO movie book(?, ?, ?, ?)", data)
  39. def inset_temp(url, title, pt_next):
  40. cur.executemany("INSERT INTO temp VALUES(?, ?, ?)", [(url, title, pt_next)])
  41. con.commit() # Remember to commit the transaction after executing INSERT.
  42. def get_text(page_url):
  43. res = requests.get(url=page_url)
  44. html = res.text
  45. sel = scrapy.Selector(text=html)
  46. title = sel.css('span[class="title"]::text').get()
  47. pt_next = sel.css('a[id="pt_next"]').xpath('@href').get()
  48. # 获取到文字
  49. words = sel.css('div[id="chaptercontent"]::text').extract()
  50. str = ''
  51. for word in words:
  52. if (word != '\n' and word != 'm.2188c.com'):
  53. str = str + word + '\n'
  54. # print(is_url_in_table(page_url));
  55. # 存储到数据库中
  56. if is_url_in_table(page_url):
  57. inset_book(url=page_url, title=title, pt_next=pt_next, text=str);
  58. # 执行下一步
  59. # inset_temp(url=page_url, title=title, pt_next=pt_next)
  60. #
  61. # with open('data.txt', 'a') as f: # 设置文件对象
  62. # f.write(str) # 将字符串写入文件中
  63. # if ("llo" in "hello, python"):
  64. # time.sleep(1);
  65. if pt_next != '' and pt_next != '/txt/31063/':
  66. url = 'https://m.2188c.com'+pt_next
  67. # with open('line.txt', 'w') as f: # 设置文件对象
  68. # f.write(url) # 将字符串写入文件中
  69. print(title, pt_next)
  70. get_text(url)
  71. # 11649
  72. # get_text('https://m.2188c.com/txt/31063/93347948.html');
  73. # get_text('https://m.2188c.com/txt/31063/85901149.html');
  74. # get_text('https://m.2188c.com/txt/31063/82277794.html');
  75. get_text('https://m.2188c.com/txt/31063/88316053.html');