ProvinceChildJob.php 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. <?php
  2. namespace addons\RfDevTool\common\queues;
  3. use Yii;
  4. use yii\base\BaseObject;
  5. use yii\queue\JobInterface;
  6. use common\helpers\StringHelper;
  7. use common\models\common\Provinces;
  8. use linslin\yii2\curl\Curl;
  9. use QL\QueryList;
  10. use addons\RfDevTool\common\models\ProvinceGatherLog;
  /**
   * Class ProvinceChildJob
   *
   * Queue job that scrapes one administrative-division page (the page linked
   * from `$parent['chlidLink']`), stores every row found on it as a
   * `Provinces` record and, while `$maxLevel` has not been reached, pushes a
   * new ProvinceChildJob for each child so the next level is scraped too.
   *
   * @package addons\RfDevTool\common\queues
   * @author jianyan74 <751393839@qq.com>
   */
  class ProvinceChildJob extends BaseObject implements JobInterface
  {
      /**
       * Base URL that child page paths are appended to.
       *
       * @var string
       */
      public $baseUrl;

      /**
       * Deepest level to collect; a child job is only queued while
       * `child level + 1 <= maxLevel`.
       *
       * @var int
       */
      public $maxLevel;

      /**
       * Row data of the parent division. This class reads the keys
       * `code`, `chlidLink`, `level`, `id` and `tree` from it.
       *
       * @var array
       */
      public $parent;

      /**
       * Identifier copied into every child job and every log entry.
       *
       * @var int
       */
      public $job_id;

      /**
       * Remaining retry attempts.
       *
       * @var int
       */
      public $reconnection = 5;

      /**
       * Level handled by this job; selects the entry of `$range` to use.
       *
       * @var int
       */
      public $level = 2;

      /**
       * Path prefix (relative to `$baseUrl`) under which the links found on
       * the current page are resolved.
       *
       * @var string
       */
      public $chlidPrefix;

      /**
       * CSS range selector per level (2 = city, 3 = county, 4 = town,
       * 5 = village table).
       *
       * @var string[]
       */
      public $range = [
          2 => 'table.citytable td+td',
          3 => 'table.countytable td+td',
          4 => 'table.towntable td+td',
          5 => 'table.villagetable .villagetr',
      ];

      /**
       * Entry point called by the queue worker.
       *
       * @param \yii\queue\Queue $queue
       * @return mixed|void
       */
      public function execute($queue)
      {
          /** @var QueryList $ql */
          $ql = QueryList::getInstance();

          // Register an "http" method on the QueryList object: it downloads the
          // page, converts it to UTF-8 and loads it as the document to query.
          $ql->bind('http', function ($url) {
              $curl = new Curl();
              $html = (string) $curl->get($url);

              $encode = mb_detect_encoding($html, ["ASCII", 'UTF-8', "GB2312", "GBK", 'BIG5']);
              // mb_detect_encoding() returns false when no candidate matches;
              // false is not a valid source encoding, so keep the raw markup then.
              $str_encode = $encode === false ? $html : mb_convert_encoding($html, 'UTF-8', $encode);

              $this->setHtml($str_encode);

              return $this;
          });

          $parentCode = isset($this->parent['code'][1]) ? $this->parent['code'][1] : null;

          /******************************** village ********************************/

          // Dongguan, Zhongshan and Danzhou have towns directly below the city,
          // so a parent whose code falls in their town ranges already points at a
          // village page even though the job level is still below 5.
          $parentIsDirectTown = $parentCode !== null && (
              ($parentCode >= 441900000 && $parentCode < 443000000)
              || ($parentCode >= 460400000 && $parentCode < 460500000)
          );

          if ($parentIsDirectTown || $this->level >= 5) {
              $this->getVillage($ql);

              return;
          }

          /******************************** city / county / town ********************************/

          $level = $this->level;
          // For the three cities themselves the county table is missing, so the
          // selector of the next level has to be used.
          if ($parentCode !== null && in_array($parentCode, [4419, 4420, 4604])) {
              $level += 1;
          }

          $this->getCityCountyTown($ql, $level);
      }

      /**
       * Scrape a village page and store its rows.
       *
       * Schedules a retry through reconnection() when the page yields no rows.
       *
       * @param QueryList $ql
       */
      public function getVillage($ql)
      {
          $data = $ql->rules([
              'id' => ['td:first', 'text'],
              'title' => ['td:nth-child(3)', 'text']
          ])->http($this->parent['chlidLink'])->range($this->range[5])->query()->getData()->all();

          // No rows most likely means the download failed: retry later.
          if (empty($data)) {
              $this->reconnection();

              return;
          }

          // Iterate by value: $data is not read again, so no reference is needed
          // and none is left dangling after the loop.
          foreach ($data as $datum) {
              if (empty($datum['id']) && empty($datum['title'])) {
                  continue;
              }

              $datum['level'] = $this->parent['level'] + 1;
              $datum['pid'] = $this->parent['id'];
              $datum['tree'] = $this->parent['tree'] . $datum['pid'] . '-';

              $this->saveProvince($datum);
          }
      }

      /**
       * Scrape a city / county / town page, store its rows and queue a child
       * job for every row while `$maxLevel` allows it.
       *
       * Schedules a retry through reconnection() when the page yields no rows.
       *
       * @param QueryList $ql
       * @param int $level key of `$range` whose selector is applied to the page
       */
      public function getCityCountyTown($ql, $level)
      {
          $data = $ql->rules([
              'title' => ['a', 'text'],
              'link' => ['a', 'href']
          ])->http($this->parent['chlidLink'])->range($this->range[$level])->query()->getData()->all();

          // Codes scraped by a level-2 job get '00' appended to form the id.
          $codeSuffix = $this->level == 2 ? '00' : '';

          // No rows most likely means the download failed: retry later.
          if (empty($data)) {
              $this->reconnection();

              return;
          }

          foreach ($data as $datum) {
              // Everything below is derived from the link, so a row without one
              // cannot be processed. (The rules only yield 'title' and 'link'.)
              if (empty($datum['link'])) {
                  continue;
              }

              $code = StringHelper::replace('.html', '', $datum['link']);
              $datum['code'] = explode('/', $code);

              // The link must split into "<directory>/<code>"; otherwise there is
              // no code to build the id and the child link from.
              if (!isset($datum['code'][1])) {
                  continue;
              }

              // Path prefix for the pages below this row.
              $chlidPrefix = $this->chlidPrefix;
              if (empty($chlidPrefix)) {
                  $chlidPrefix = $datum['code'][0];
              } else {
                  $chlidPrefix = $chlidPrefix . '/' . $datum['code'][0];
              }

              $datum['id'] = $datum['code'][1] . $codeSuffix;
              $datum['level'] = $this->parent['level'] + 1;
              $datum['pid'] = $this->parent['id'];
              $datum['tree'] = $this->parent['tree'] . $datum['pid'] . '-';
              $datum['chlidPrefix'] = $chlidPrefix;
              $datum['chlidLink'] = $this->baseUrl . $chlidPrefix . '/' . $datum['code'][1] . '.html';

              $this->saveProvince($datum);

              if ($datum['level'] + 1 <= $this->maxLevel) {
                  $this->createJob($datum);
              }
          }
      }

      /**
       * Insert or update the Provinces record identified by `$datum['id']`.
       *
       * A failed save is reported through Yii::error() instead of being
       * silently dropped.
       *
       * @param array $datum row data; must contain the key `id`
       * @return bool whether the record was saved
       */
      protected function saveProvince(array $datum)
      {
          if (!($model = Provinces::findOne(['id' => $datum['id']]))) {
              $model = new Provinces();
          }

          $model->attributes = $datum;
          if (!$model->save()) {
              Yii::error(Yii::$app->services->base->analysisErr($model->getFirstErrors()));

              return false;
          }

          return true;
      }

      /**
       * Re-queue this job with one retry less, delayed by 60 seconds, or log
       * the final failure when no retries are left.
       */
      protected function reconnection()
      {
          if ($this->reconnection <= 0) {
              $this->log('采集彻底失败');

              return;
          }

          $queue = new ProvinceChildJob([
              'parent' => $this->parent,
              'baseUrl' => $this->baseUrl,
              // Must be carried over: without it the retried job would build
              // the links of the rows it finds from an empty prefix.
              'chlidPrefix' => $this->chlidPrefix,
              'maxLevel' => $this->maxLevel,
              'level' => $this->level,
              'job_id' => $this->job_id,
              'reconnection' => $this->reconnection - 1,
          ]);

          // Run again after a 60 second delay.
          $messageId = Yii::$app->queue->delay(1 * 60)->push($queue);
          $this->log('采集失败,等待重试时间60秒', $messageId);
      }

      /**
       * Write a ProvinceGatherLog entry describing the current job state.
       *
       * @param string $remark
       * @param int|string $message_id id returned by the queue push, 0 when none
       */
      protected function log($remark, $message_id = 0)
      {
          $model = new ProvinceGatherLog();
          $model->data = $this->parent;
          $model->url = $this->baseUrl;
          $model->max_level = $this->maxLevel;
          $model->level = $this->level;
          $model->job_id = $this->job_id;
          $model->message_id = $message_id;
          $model->reconnection = $this->reconnection;
          $model->remark = $remark;

          // Save exactly once and report a failure.
          if (!$model->save()) {
              Yii::error(Yii::$app->services->base->analysisErr($model->getFirstErrors()));
          }
      }

      /**
       * Push a job that scrapes the page below the given row.
       *
       * @param array $datum row data; becomes the `parent` of the new job and
       *                     must contain the key `chlidPrefix`
       */
      protected function createJob($datum)
      {
          $queue = new ProvinceChildJob([
              'parent' => $datum,
              'baseUrl' => $this->baseUrl,
              'chlidPrefix' => $datum['chlidPrefix'],
              'maxLevel' => $this->maxLevel,
              'level' => $this->level + 1,
              'job_id' => $this->job_id,
          ]);

          Yii::$app->queue->push($queue);
      }
  }
粤ICP备19079148号