284 lines
8.8 KiB

  1. #coding=utf-8
  2. #!/usr/bin/python
  3. import sys
  4. sys.path.append('..')
  5. from base.spider import Spider
  6. import re
  7. from urllib import request, parse
  8. import urllib
  9. import urllib.request
  10. import json
  11. class Spider(Spider): # 元类 默认的元类 type
  12. def getName(self):
  13. return "卡通站(kt30)"
  14. def init(self,extend=""):
  15. pass
  16. def isVideoFormat(self,url):
  17. pass
  18. def manualVideoCheck(self):
  19. pass
  20. def homeContent(self,filter):
  21. result = {}
  22. cateManual = {
  23. "日本动漫": "r",
  24. "国产动漫": "g",
  25. "港台动漫": "gm",
  26. "动画电影": "v",
  27. "欧美动漫": "o"
  28. }
  29. classes = []
  30. for k in cateManual:
  31. classes.append({
  32. 'type_name': k,
  33. 'type_id': cateManual[k]
  34. })
  35. result['class'] = classes
  36. if (filter):
  37. result['filters'] = self.config['filter']
  38. return result
  39. def homeVideoContent(self):
  40. htmlTxt = self.webReadFile(urlStr="http://kt30.com/",header=self.header)
  41. videos = self.get_list(html=htmlTxt,patternTxt=r'a class="stui-vodlist__thumb lazyload" href="(?P<url>.+?)" title="(?P<title>.+?)" data-original="(?P<img>.+?)".+?"><span class="play hidden-xs"></span><span class="pic-text text-right">(?P<renew>.+?)</span></a>')
  42. result = {
  43. 'list': videos
  44. }
  45. return result
  46. def categoryContent(self,tid,pg,filter,extend):
  47. result = {}
  48. year='0'#年份
  49. types='0'#类型
  50. area='all'#地区
  51. url = 'http://kt30.com/{0}/index_{1}.html'.format(tid,pg)
  52. htmlTxt=self.webReadFile(urlStr=url,header=self.header)
  53. videos=[]
  54. videos = self.get_list(html=htmlTxt,patternTxt=r'<a class="stui-vodlist__thumb lazyload" href="(?P<url>.+?)" title="(?P<title>.+?)" data-original="(?P<img>.+?)".+?"><span class="play hidden-xs"></span><span class="pic-text text-right">(?P<renew>.+?)</span></a>')
  55. numvL = len(videos)
  56. result['list'] = videos
  57. result['page'] = pg
  58. result['pagecount'] = pg if numvL<17 else 9999
  59. result['limit'] = numvL
  60. result['total'] = numvL
  61. return result
  62. def detailContent(self,array):
  63. aid = array[0].split('###')
  64. idUrl=aid[1]
  65. title=aid[0]
  66. pic=aid[2]
  67. playFrom = []
  68. vodItems = []
  69. videoList=[]
  70. htmlTxt = self.webReadFile(urlStr=idUrl,header=self.header)
  71. if len(htmlTxt)<5:
  72. return {'list': []}
  73. line=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'</span><h3 class="title">(.+?)</h3></div>',Index=1)
  74. playFrom=[self.removeHtml(txt=vod) for vod in line]
  75. if len(line)<1:
  76. return {'list': []}
  77. circuit=self.get_lineList(Txt=htmlTxt,mark='<ul class="stui-content__playlist',after='</ul>')
  78. # print(circuit[0])
  79. # return
  80. for vod in circuit:
  81. vodItems = self.get_EpisodesList(html=vod,RegexText=r'<a href="(?P<url>.+?)">(?P<title>.+?)</a>')
  82. joinStr = "#".join(vodItems)
  83. videoList.append(joinStr)
  84. temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/vodsearch/----%|\w+?---------.html" target="_blank">(.+?)</a>',Index=1)
  85. typeName="/".join(temporary)
  86. year=self.get_RegexGetText(Text=htmlTxt,RegexText=r'<a href="/vodsearch/-------------\d{4}.html" target="_blank">(\d{4})</a>',Index=1)
  87. temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/vodsearch/-.+?------------.html" target="_blank">(.+?)</a>',Index=1)
  88. act="/".join(temporary)
  89. temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/vodsearch/-----%+?|\w+?--------.html" target="_blank">(.+?)</a>',Index=1)
  90. dir="/".join(temporary)
  91. area=self.get_RegexGetText(Text=htmlTxt,RegexText=r'地区:</b>(.*?)<b>',Index=1)
  92. #area=self.get_RegexGetText(Text=htmlTxt,RegexText=r'>语言:\s{0,4}(.*?)</p>',Index=1)
  93. cont=self.get_RegexGetText(Text=htmlTxt,RegexText=r'简介:(.+?)<a href="#desc">详情',Index=1)
  94. vod = {
  95. "vod_id": array[0],
  96. "vod_name": title,
  97. "vod_pic": pic,
  98. "type_name": self.removeHtml(txt=typeName),
  99. "vod_year": year,
  100. "vod_area": self.removeHtml(txt=area),
  101. "vod_remarks": "",
  102. "vod_actor": self.removeHtml(txt=act),
  103. "vod_director": self.removeHtml(txt=dir),
  104. "vod_content": self.removeHtml(txt=cont)
  105. }
  106. vod['vod_play_from'] = '$$$'.join(playFrom)
  107. vod['vod_play_url'] = "$$$".join(videoList)
  108. result = {
  109. 'list': [
  110. vod
  111. ]
  112. }
  113. return result
  114. def verifyCode(self):
  115. pass
  116. def searchContent(self,key,quick):
  117. Url='http://kt30.com/vodsearch/-------------.html?wd={0}'.format(urllib.parse.quote(key))
  118. htmlTxt = self.webReadFile(urlStr=Url,header=self.header)
  119. videos = self.get_list(html=htmlTxt,patternTxt=r'<a class="v-thumb stui-vodlist__thumb lazyload" href="(?P<url>.+?)" title="(?P<title>.+?)" data-original="(?P<img>.+?)".+?</span><span class="pic-text text-right">(?P<renew>.+?)</span></a>')
  120. result = {
  121. 'list': videos
  122. }
  123. return result
  124. def playerContent(self,flag,id,vipFlags):
  125. result = {}
  126. parse=1
  127. jx=0
  128. url=id
  129. htmlTxt=self.webReadFile(urlStr=url,header=self.header)
  130. temporary=self.get_lineList(Txt=htmlTxt,mark=r'var player_aaaa=',after='</script>')
  131. if len(temporary)>0:
  132. jRoot=json.loads(temporary[0][16:])
  133. url=jRoot['url']
  134. if len(url)<5:
  135. url=id
  136. else:
  137. parse=0
  138. result["parse"] = parse#1=嗅探,0=播放
  139. result["playUrl"] = ''
  140. result["url"] = url
  141. result['jx'] = jx#1=VIP解析,0=不解析
  142. result["header"] = ''
  143. return result
  144. config = {
  145. "player": {},
  146. "filter": {}
  147. }
  148. header = {
  149. "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
  150. 'Host': 'kt30.com',
  151. "Referer": "http://kt30.com/"
  152. }
  153. def localProxy(self,param):
  154. return [200, "video/MP2T", action, ""]
  155. #-----------------------------------------------自定义函数-----------------------------------------------
  156. #访问网页
  157. def webReadFile(self,urlStr,header):
  158. html=''
  159. req=urllib.request.Request(url=urlStr,headers=header)#,headers=header
  160. with urllib.request.urlopen(req) as response:
  161. html = response.read().decode('utf-8')
  162. return html
  163. #正则取文本
  164. def get_RegexGetText(self,Text,RegexText,Index):
  165. returnTxt=""
  166. Regex=re.search(RegexText, Text, re.M|re.S)
  167. if Regex is None:
  168. returnTxt=""
  169. else:
  170. returnTxt=Regex.group(Index)
  171. return returnTxt
  172. #取集数
  173. def get_EpisodesList(self,html,RegexText):
  174. ListRe=re.finditer(RegexText, html, re.M|re.S)
  175. videos = []
  176. for vod in ListRe:
  177. url = vod.group('url')
  178. title =vod.group('title')
  179. if len(url) == 0:
  180. continue
  181. if url.find('http:') <0:
  182. url='http://kt30.com'+url
  183. videos.append(title+"$"+url)
  184. return videos
  185. #取剧集区
  186. def get_lineList(self,Txt,mark,after):
  187. circuit=[]
  188. origin=Txt.find(mark)
  189. while origin>8:
  190. end=Txt.find(after,origin)
  191. circuit.append(Txt[origin:end])
  192. origin=Txt.find(mark,end)
  193. return circuit
  194. #正则取文本,返回数组
  195. def get_RegexGetTextLine(self,Text,RegexText,Index):
  196. returnTxt=[]
  197. ListRe=istRe=re.finditer(RegexText, Text, re.M|re.S)
  198. for value in ListRe:
  199. t=value.group(Index)
  200. if t==None:
  201. continue
  202. returnTxt.append(t)
  203. return returnTxt
  204. #分类取结果
  205. def get_list(self,html,patternTxt):
  206. ListRe=re.finditer(patternTxt, html, re.M|re.S)
  207. videos = []
  208. head="http://kt30.com"
  209. for vod in ListRe:
  210. url = vod.group('url')
  211. title =self.removeHtml(txt=vod.group('title'))
  212. img =vod.group('img')
  213. renew=vod.group('renew')
  214. if len(url) == 0:
  215. continue
  216. if len(img)<5:
  217. img='https://agit.ai/lanhaidixingren/Tvbox/raw/branch/master/CoverError.png'
  218. if self.get_RegexGetText(Text=img,RegexText='(https{0,1}:)',Index=1)=='':
  219. img=head+img
  220. # print(title)
  221. videos.append({
  222. "vod_id":"{0}###{1}###{2}".format(title,head+url,img),
  223. "vod_name":title,
  224. "vod_pic":img,
  225. "vod_remarks":renew
  226. })
  227. return videos
  228. #删除html标签
  229. def removeHtml(self,txt):
  230. soup = re.compile(r'<[^>]+>',re.S)
  231. txt =soup.sub('', txt)
  232. return txt.replace("&nbsp;"," ")
  233. #番剧
  234. def get_list_fanju(self,html):
  235. ListRe=re.finditer('class="jtxqj"><a href="(?P<url>.+?)" title="(?P<title>.+?)" target="_self">(?P<renew>.+?)</a>', html, re.M|re.S)
  236. videos = []
  237. head="http://ktkkt8.com"
  238. img='https://agit.ai/lanhaidixingren/Tvbox/raw/branch/master/%E5%B0%81%E9%9D%A2.jpeg'
  239. for vod in ListRe:
  240. url = vod.group('url')
  241. title =self.removeHtml(txt=vod.group('title'))
  242. renew=vod.group('renew')
  243. if len(url) == 0:
  244. continue
  245. videos.append({
  246. "vod_id":"{0}###{1}###{2}".format(title,head+url,img),
  247. "vod_name":title,
  248. "vod_pic":img,
  249. "vod_remarks":renew
  250. })
  251. return videos
  252. # T=Spider()
  253. # l=T.homeVideoContent()
  254. # l=T.searchContent(key='柯南',quick='')
  255. # l=T.categoryContent(tid='r',pg='1',filter=False,extend={})
  256. # for x in l['list']:
  257. # print(x['vod_id'])
  258. # mubiao= l['list'][1]['vod_id']
  259. # playTabulation=T.detailContent(array=[mubiao,])
  260. # # print(playTabulation)
  261. # vod_play_from=playTabulation['list'][0]['vod_play_from']
  262. # vod_play_url=playTabulation['list'][0]['vod_play_url']
  263. # url=vod_play_url.split('$$$')
  264. # vod_play_from=vod_play_from.split('$$$')[0]
  265. # url=url[0].split('$')
  266. # url=url[1].split('#')[0]
  267. # print(url)
  268. # m3u8=T.playerContent(flag=vod_play_from,id=url,vipFlags=True)
  269. # print(m3u8)