diff --git a/.gitignore b/.gitignore index 37fc9d4..00f0cc6 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,7 @@ ENV/ # Rope project settings .ropeproject +# celery-beat +*.bak +*.dat +*.dir diff --git a/Algorithm/README.md b/Algorithm/README.md new file mode 100644 index 0000000..ecff713 --- /dev/null +++ b/Algorithm/README.md @@ -0,0 +1,6 @@ +1. [约瑟夫环](https://github.com/lambdaplus/python/blob/master/Algorithm/joseph-ring.py) +2. [寻找两个链表的第一个交点](https://www.zybuluo.com/Scrazy/note/719335) +3. [删除链表中的重复元素](https://www.zybuluo.com/Scrazy/note/720542) +4. [数组中的数组成最小值](https://www.zybuluo.com/Scrazy/note/720582) +5. [索引为index 的丑数](https://www.zybuluo.com/Scrazy/note/720587) +6. [反转链表](https://www.zybuluo.com/Scrazy/note/721436) diff --git a/Algorithm/arry.md b/Algorithm/arry.md deleted file mode 100644 index bd91a35..0000000 --- a/Algorithm/arry.md +++ /dev/null @@ -1,30 +0,0 @@ -# 题目 -调整数组顺序使奇数位于偶数前面 -## 描述 -输入一个整数数组,实现一个函数来调整该数组中数字的顺序,使得所有的奇数位于数组的前半部分,所有的偶数位于位于数组的后半部分,并保证奇数和奇数,偶数和偶数之间的相对位置不变。 - -马上熄灯啦,直接贴渣渣代码!! -```python -# -*- coding: utf-8 -*- -import random - - -def foo(L): - L1 = [] - L2 = [] - for i in L: - if i % 2 != 0: - L1.append(i) - else: - L2.append(i) - - return L1 + L2 - -if __name__ == '__main__': - L = [random.randrange(100) for _ in range(10)] - print(L) - print(foo(L)) - -``` -代码很渣,有空继续优化!! -就酱。。。洗裤子去啦。。。。。。。。 diff --git a/Algorithm/binary_search.md b/Algorithm/binary_search.md index 4a9ab49..9418fb7 100644 --- a/Algorithm/binary_search.md +++ b/Algorithm/binary_search.md @@ -1,5 +1,4 @@ -# 二分法查找 - +# 二分法查找 二分法查找,顾名思义,二分、二分就是分成两半呗。(有的翻译是`折半法搜索`比如SICP里翻译的就是`折半法搜索`)。它的复杂度为O(logn),在列表(已排序)中对给定值`value`进行查找并输出其索引(index)值。 @@ -9,15 +8,14 @@ def binary_search(lst, value): left, right = 0, len(lst) - 1 - + while left <= right: - middle = int((left + right) / 2) # 取`lst`中值索引 - + middle = (left + right) // 2 # 取`lst`中值索引 + if value > lst[middle]: left = middle + 1 # value大于`lst`中值,让左边界等于 middle + 1 elif value < lst[middle]: right = middle - 1 # 类似上 - else: return "The value's index is {}".format(middle) return "There is no {}".format(value) @@ -26,16 +24,16 @@ if __name__ == '__main__': lst = [1, 3, 5, 7, 9] value = int(input("Please input the value(1-10): ")) print(binary_search(lst, value)) - ``` 再来个递归(recursion)版的吧, 不作过多解释啦! 
+ ```python # -*- coding: utf-8 -*- def binary_search_rec(lst, value, left, right): - middle = int((left + right) / 2) + middle = (left + right) // 2 if left > right: return "I'm sorry, there is no {}".format(value) @@ -44,7 +42,7 @@ def binary_search_rec(lst, value, left, right): return binary_search_rec(lst, value, left, middle - 1) elif value > lst[middle]: - return binary_search_rec(lst, value, middle + 1, right) + return binary_search_rec(lst, value, middle + 1, right) else: return "Congratulations, the value's({}) index is {}".format(value, middle) diff --git a/Algorithm/joseph-ring.py b/Algorithm/joseph-ring.py new file mode 100644 index 0000000..46dd733 --- /dev/null +++ b/Algorithm/joseph-ring.py @@ -0,0 +1,16 @@ +# coding: utf-8 +# %load python/Algorithm/joseph-ring.py +def joseph_ring(n, m): + if n < 1: + return -1 + + result = -1 + start = 0 + + ring_num = list(range(n)) + while ring_num: + k = (start + m - 1) % n + result = ring_num.pop(k) + n -= 1 + start = k + return result diff --git a/Algorithm/merge-two-list.md b/Algorithm/merge-two-list.md index 7453e67..acb55eb 100644 --- a/Algorithm/merge-two-list.md +++ b/Algorithm/merge-two-list.md @@ -8,22 +8,19 @@ from random import randrange def merge_two_list(lst1, lst2): + rst = [] while lst1 and lst2: - lst3.append(lst1.pop(0) if lst1[0] <= lst2[0] else lst2.pop(0)) - - while lst1: - lst3.append(lst1.pop(0)) - - while lst2: - lst3.append(lst2.pop(0)) - - return lst3 + if lst1[-1] > lst2[-1]: + rst.append(lst1.pop()) + else: + rst.append(lst2.pop()) + rst.reverse() + return (lst1 or lst2) + rst if __name__ == "__main__": - lst3 = [] lst1 = sorted([randrange(100) for _ in range(10)]) - print(lst1) + print('lst1 is : ', lst1) lst2 = sorted([randrange(100) for _ in range(6)]) - print(lst2) - print(merge_two_list(lst1, lst2)) + print('lst2 is : ', lst2) + print('The merged list is\n', merge_two_list(lst1, lst2)) ``` diff --git a/Algorithm/reverse_print_link_list.py b/Algorithm/reverse_print_link_list.py new file mode 100644 index 0000000..0a270bc --- /dev/null +++ b/Algorithm/reverse_print_link_list.py @@ -0,0 +1,18 @@ +# -*- coding:utf-8 -*- +class ListNode: + def __init__(self, x): + self.val = x + self.next = None + +class Solution: + # 返回从尾部到头部的列表值序列,例如[1,2,3] + def printListFromTailToHead(self, listNode): + # write code here + res = [] + if listNode is None: + return res + while listNode: + res.append(listNode.val) + listNode = listNode.next + res.reverse() + return res diff --git a/Algorithm/sort/bubble-sort.py b/Algorithm/sort/bubble-sort.py index db27018..d91f215 100644 --- a/Algorithm/sort/bubble-sort.py +++ b/Algorithm/sort/bubble-sort.py @@ -7,7 +7,7 @@ def bubble_sort(L): 冒泡排序主要使用两次循环实现排序。 外循环中的一个数字依次与内层循环中的每个数字进行比较,如果索引值小的数字大于索引值大的数字,交换位置。否则,位置不变。直至外循环结束。 ''' - if len(L) < 2: # 列表内元素低于2,直接返回! + if len(L) < 2: # 列表内元素低于2,直接返回! 
return L for i in range(len(L)): for j in range(1, len(L)): @@ -15,6 +15,7 @@ def bubble_sort(L): L[j - 1], L[j] = L[j], L[j - 1] return L + if __name__ == '__main__': L = [randrange(1000) for _ in range(10)] print(bubble_sort(L)) diff --git a/Algorithm/sort/insert_sort.py b/Algorithm/sort/insert_sort.py index 29681f6..27b69f0 100644 --- a/Algorithm/sort/insert_sort.py +++ b/Algorithm/sort/insert_sort.py @@ -1,9 +1,12 @@ # coding=utf-8 + from random import randrange + def insert_sort(L): if len(L) < 2: return L + for i in range(1, len(L)): tmp = L[i] j = i - 1 @@ -14,6 +17,6 @@ def insert_sort(L): return L -#if __name__ == '__main__': +# if __name__ == '__main__': # L = [randrange(1000) for _ in range(10)] # print(insert_sort(L)) diff --git a/Algorithm/sort/merge/merge-sort.py b/Algorithm/sort/merge/merge-sort.py index b6045ad..d2947fb 100644 --- a/Algorithm/sort/merge/merge-sort.py +++ b/Algorithm/sort/merge/merge-sort.py @@ -1,6 +1,7 @@ # -*-coding: utf-8 -*- from random import randrange + def merge_sort(L): if len(L) < 2: return L @@ -9,21 +10,17 @@ def merge(left, right): merged = [] while left and right: - merged.append(left.pop(0) if left[0] <= right[0] else right.pop(0)) - - while left: - merged.append(left.pop(0)) + merged.append(left.pop(0) if left[0] <= right[0] + else right.pop(0)) - while right: - merged.append(right.pop(0)) - - return merged + return merged + (left or right) - mid = int(len(L)/2) + mid = len(L) // 2 left = merge_sort(L[:mid]) right = merge_sort(L[mid:]) return merge(left, right) + if __name__ == "__main__": L = [randrange(1000) for _ in range(10)] print(merge_sort(L)) diff --git a/Algorithm/sort/merge/merge-sort1.py b/Algorithm/sort/merge/merge-sort1.py index c223c61..bd82a5b 100644 --- a/Algorithm/sort/merge/merge-sort1.py +++ b/Algorithm/sort/merge/merge-sort1.py @@ -7,29 +7,32 @@ merge(*iterables, key=None, reverse=False) Merge multiple sorted inputs into a single sorted output. - + Similar to sorted(itertools.chain(*iterables)) but returns a generator, does not pull the data into memory all at once, and assumes that each of the input streams is already sorted (smallest to largest). - + >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] - + If *key* is not None, applies a key function to each element to determine its sort order. 
-    
+
     >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
     ['dog', 'cat', 'fish', 'horse', 'kangaroo']
 '''
+
+
 def merge_sort(L):
     if len(L) < 2:
         return L
-    mid = int(len(L)/2)
+    mid = len(L) // 2
     left = merge_sort(L[:mid])
     right = merge_sort(L[mid:])
     return list(merge(left, right))
+
 if __name__ == "__main__":
     L = [randrange(100) for _ in range(10)]
     print(merge_sort(L))
diff --git a/Algorithm/sort/merge/merge-sort2.py b/Algorithm/sort/merge/merge-sort2.py
new file mode 100644
index 0000000..0fbc286
--- /dev/null
+++ b/Algorithm/sort/merge/merge-sort2.py
@@ -0,0 +1,26 @@
+# coding=utf-8
+# 性能更好
+from random import randrange
+
+
+def merge_sort(seq):
+    mid = len(seq) // 2
+    lft, rgt = seq[:mid], seq[mid:]
+    if len(lft) > 1:
+        lft = merge_sort(lft)
+    if len(rgt) > 1:
+        rgt = merge_sort(rgt)
+
+    res = []
+    while lft and rgt:
+        if lft[-1] >= rgt[-1]:  # 取 lft 和 rgt 尾部较大的值
+            res.append(lft.pop())
+        else:
+            res.append(rgt.pop())
+    res.reverse()  # 反序一下
+    return (lft or rgt) + res
+
+
+if __name__ == '__main__':
+    seq = [randrange(100) for _ in range(10)]
+    print(merge_sort(seq))
diff --git a/Algorithm/sort/quick-sort.py b/Algorithm/sort/quick-sort.py
index 67fd1f2..2efc0c3 100644
--- a/Algorithm/sort/quick-sort.py
+++ b/Algorithm/sort/quick-sort.py
@@ -1,15 +1,19 @@
 # coding=utf-8
 import random
 
+
 def quick_sort(seq):
     if len(seq) < 2:
         return seq
 
     mid = random.choice(seq)
     small = [x for x in seq if x < mid]
+    equal = [x for x in seq if x == mid]  # 与基准相等的元素单独收集,重复值既不丢失也不重复
     big = [x for x in seq if x > mid]
 
-    return quick_sort(small) + [mid] + quick_sort(big)
+    return quick_sort(small) + equal + quick_sort(big)
 
+
 if __name__ == '__main__':
     L = [random.randrange(1000) for _ in range(10)]
     print(quick_sort(L))
diff --git a/Algorithm/sort/select-sort.py b/Algorithm/sort/select-sort.py
new file mode 100644
index 0000000..fe2e67c
--- /dev/null
+++ b/Algorithm/sort/select-sort.py
@@ -0,0 +1,21 @@
+# coding=utf-8
+from random import randrange
+
+
+def select_sort(seq):
+    if len(seq) < 2:
+        return seq
+
+    for i in range(len(seq)-1, 0, -1):
+        max_j = i
+        for j in range(i):
+            if seq[j] > seq[max_j]:
+                max_j = j
+        seq[i], seq[max_j] = seq[max_j], seq[i]
+
+    return seq
+
+
+if __name__ == '__main__':
+    seq = [randrange(100) for _ in range(10)]
+    print(select_sort(seq))
diff --git a/Algorithm/sort/shell-sort.py b/Algorithm/sort/shell-sort.py
new file mode 100644
index 0000000..7f35d1d
--- /dev/null
+++ b/Algorithm/sort/shell-sort.py
@@ -0,0 +1,18 @@
+# coding=utf-8
+def shell_sort(seq):
+    if len(seq) < 2:
+        return seq
+
+    n = len(seq)
+    mid = n // 2
+    while mid > 0:
+        for i in range(mid, n):
+            tmp = seq[i]
+            j = i
+            while j >= mid and seq[j-mid] > tmp:
+                seq[j] = seq[j-mid]
+                j -= mid
+            seq[j] = tmp
+        mid = mid // 2
+    return seq
+
diff --git a/README.md b/README.md
index ce619f3..045d46e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,16 @@
 # Python
-python练习中的一些代码,以防重装系统再次丢失
 1. [几种排序算法](https://github.com/lambdaplus/python/tree/master/Algorithm/sort)
 2. [翻转列表](https://github.com/lambdaplus/python/blob/master/resver.md)
 3. [二分法查找](https://github.com/lambdaplus/python/blob/master/Algorithm/binary_search.md)
+4. [一个异步爬虫](https://github.com/lambdaplus/python/blob/master/%E4%B8%80%E4%B8%AA%E5%BC%82%E6%AD%A5%E7%88%AC%E8%99%AB.md)
+5. [豆瓣电影Top250爬虫](https://github.com/lambdaplus/python/blob/master/%E8%B1%86%E7%93%A3%E7%94%B5%E5%BD%B1Top250%20%E7%88%AC%E8%99%AB.md)
+6. [装饰器](https://www.zybuluo.com/Scrazy/note/551565)
+7. [二叉树](https://www.zybuluo.com/Scrazy/note/390264)
+8. [Celery](https://www.zybuluo.com/Scrazy/note/697561)
+9. 
[RabbitMQ](https://www.zybuluo.com/Scrazy/note/699512) +10. [Python与数据库](https://www.zybuluo.com/Scrazy/note/702005) +11. [新浪博客抓取及简单聚类](https://www.zybuluo.com/mdeditor#709348) +12. [Python 算法教程 笔记](https://www.zybuluo.com/Scrazy/note/530998) +13. [TCP简述](https://www.zybuluo.com/Scrazy/note/717147) +14. [单例模式](https://www.zybuluo.com/Scrazy/note/719017) diff --git a/binary_search.md b/binary_search.md new file mode 100644 index 0000000..7e7034b --- /dev/null +++ b/binary_search.md @@ -0,0 +1,136 @@ + +--- +title: 二分法查找 +date: 2016-09-23 15:32:24 +tags: Algorithm +--- + +二分法查找,顾名思义,二分、二分就是分成两半呗。(有的翻译是`折半法搜索`比如SICP里翻译的就是`折半法搜索`)。它的复杂度为O(logn),在列表(已排序)中对给定值`value`进行查找并输出其索引(index)值。 + +```python +# -*- coding: utf-8 -*- + + +def binary_search(lst, value): + left, right = 0, len(lst) - 1 + + while left <= right: + middle = int((left + right) / 2) # 取`lst`中值索引 + + if value > lst[middle]: + left = middle + 1 # value大于`lst`中值,让左边界等于 middle + 1 + elif value < lst[middle]: + right = middle - 1 # 类似上 + + else: + return "The value's index is {}".format(middle) + return "There is no {}".format(value) + +if __name__ == '__main__': + lst = [1, 3, 5, 7, 9] + value = int(input("Please input the value(1-10): ")) + print(binary_search(lst, value)) + +``` + +再来个递归(recursion)版的吧, 不作过多解释啦! +```python +# -*- coding: utf-8 -*- + + +def binary_search_rec(lst, value, left, right): + middle = int((left + right) / 2) + + if left > right: + return "I'm sorry, there is no {}".format(value) + + if value < lst[middle]: + return binary_search_rec(lst, value, left, middle - 1) + + elif value > lst[middle]: + return binary_search_rec(lst, value, middle + 1, right) + + else: + return "Congratulations, the value's({}) index is {}".format(value, middle) + + +if __name__ == '__main__': + lst = [1, 3, 5, 7, 9] + left, right = 0, len(lst) + value = int(input("Please input the value: ")) + print(binary_search_rec(lst, value, left, right)) +``` +没事。温习以下二分搜索! + +被拼写错误折磨了一晚上。好好的lft被我写成ltf。debug生无可恋! +```python +from random import randrange +def binary_search(seq, sit, lft, rgt): + mid = (lft + rgt) // 2 + + if lft > rgt: + return 'The seq no {}'.format(sit) + + if sit > seq[mid]: + return binary_search(seq, sit, mid+1, rgt) + + elif sit < seq[mid]: + return binary_search(seq, sit, lft, mid-1) + + else: + return 'The {} in the seq and the station is {}'.format(sit, mid) + +if __name__ == '__main__': + seq = [1, 4, 6, 8, 9, 12, 44, 56] + lft, rgt = 0, len(seq) + print(binary_search(seq, 4, lft, rgt)) +``` +昨天面试,面试官出了一道算法题: + +> 有一个数组,其内元素先递增后递减,请找出其中的最大值. + +对于我来说,当时第一个想起来的是,排序但是转念间就知道肯定不是最好的啦.于是就在哪儿想啊想,还是想不起来.气氛挺尴尬的,外面也挺冷的(电话面试,外面安静).我想不起来,面试小哥也不急着催我,最后也算是在小哥的提示下,想起了怎么做啦!(太感谢小哥啦, 小哥好人! 喂, 你们几个不许笑啊喂!) + +当然是**二分**啦,下面是算法实现! 
+ +```python +# coding=utf-8 +def search_max_num(seq, left, right): + mid = (right + left) // 2 + if left > right: + return seq[mid] + if seq[mid] > seq[mid - 1]: + return search_max_num(seq, mid + 1, right) + else: + return search_max_num(seq, left, mid - 1) +if __name__ == "__main__": + seq = [32, 55, 54, 54, 54, 54, 32, 15, 6, 4, 2, 1] + print(search_max_num(seq, 0, len(seq))) +``` +### 二维数组的查找 +class Solution: + # array 二维列表 + + def find(self, target, array): + # write code here + for arr in array: + lft, rgt =0, len(arr) - 1 + while lft <= rgt: + mid = (lft + rgt) // 2 + if target > arr[mid]: + lft = mid + 1 + elif target < arr[mid]: + rgt = mid - 1 + else: + return arr[mid] + return 'No target' + + +target = 8 +array = [ + [1, 3, 5, 7, 9], + [2, 4, 6, 8, 10] +] +solution = Solution() +solution.find(target, array) +``` diff --git a/binary_tree.md b/binary_tree.md new file mode 100644 index 0000000..9aa4289 --- /dev/null +++ b/binary_tree.md @@ -0,0 +1,354 @@ +--- +更新 `2017-03-22` +--- + +畏惧了好久的二叉树,终于在近两天开搞了。二分法查找已在前几天完成,磨刀霍霍向猪羊,吼吼吼! 何为二叉树?按照我目前的理解就是类似于发叉的树,树干上发两个叉或者一个(不发叉的树真不到有何用处),发叉的地方称为**节点**。然后发的两个叉又可以继续像树干一样发叉,新发的叉有可以继续发叉,子又生子,孙又生孙,无穷尽也!但是**树的左边的叉的值小于节点值,右边的大于节点值**。 + +本文参考: [老齐的Github](https://github.com/qiwsir/algorithm/blob/master/binary_tree.md) + +首先,建立一棵树。 + +```python +class Node: + def __init__(self, data): + self.left = None + self.right = None + self.data = data +``` + +这样,光秃秃的小树苗就种好了。接着就是茁长生长啦。浇水去喽! + +```python +class Node: + ''' + ... + ''' + def insert(self, data): + if data < self.data: # 树叉小于节点 + if self.left is None: # 并且左面的树叉为空 + self.left = Node(data) # 当仁不让的插入 + else: # 非空的话 + self.left.insert(data) # 以左树叉为节点继续插入 + + elif data > self.data: + if self.right is None: + self.right = Node(data) + else: + self.right.insert(data) + else: + self.data = data +``` + +浇完水后,小树苗噌噌的往上窜啊。 + +```python +class Node: + ''' + 省略上述代码 + ''' + def search(self, data, parent=None): + ''' + data为目标查询值,同时返回parent(父节点)便于定位。 + ''' + if data < self.data: + if self.left is None: + return None, None + else: + return self.left.search(data, self) + + elif data > self.data: + if self.right is None: + return None, None + + return self.right.search(data, self) + else: + # return self.data, parent.data + return self, parent +``` + +树苗生长的那么好,想看看每个叉上都是啥呀,来来来,抬头往上看((其实是往下看啦)。 + +```python +def print_tree(self): + if self.left: + self.left.print_tree() + print(self.data) + if self.right: + self.right.print_tree() +``` + +树的遍历又分为以下三种: + +1. 前序(root -> left -> right) +2. 中序(left -> root -> right) +3. 后序(left -> right -> root) + +调整`print_tree`函数里 `print(self.data)` 的顺序即可实现三种遍历方式。 + +转眼间小树苗涨的太旺盛了,疯涨啊!!怎么办呢,剪几个枝吧。别怪我哦,小树苗! 删除节点时,有三种可能的情况: + +1. 目标节点下没有任何节点(0个) +2. 目标节点下有一个节点 +3. 目标节点下有两个节点 + +判断节点数目程序如下: + +```python +class Node: +''' +省略代码 +''' +def chrildren(self): + count = 0 + if self.left: + count += 1 + + if self.right: + count += 1 + + return count +``` + +接下来就是删除操作啦。哦吼吼。 + +```python +class Node: +''' +省略 +''' + +def delete(self, data): + node, parent = self.search(data) + chrildren = node.chrildren() # 子节点数目 + if chrildren == 0: # 情况 1, 没有子节点,直接删除即可 + if parent.left is node: # 判断目标节点是其父节点的 左or右 节点 + parent.left = None + else: + parent.right = None + del node + + elif chrildren == 1: # 情况 2, 有一个子节点,用子节点替换其即可 + if node.left: + tmp = node.left + else: + tmp = node.right + if parent: + if parent.left is node: + parent.left = tmp + else: + parent.right = tmp + del node + else: + ''' + 第三种情况比较复杂: + 1\. 左节点0个子节点 + 2\. 左节点1个子节点 + 3\. 
左节点2个子节点 + ''' + parent = node + successor = node.right + while successor.left: # 递归思想,直至找到'最左'的子节点, 保持树的平衡,用右子节点的值替换 + parent = successor + successor = successor.left + node.data = successor.data + if parent.left == successor: + parent.left = successor.right + else: + parent.right = successor.right + +# 接下来可以测试以下种的树怎么样啦。 + +root = Node(11) root.insert(14) root.insert(9) root.insert(9) root.insert(7) root.insert(10) root.insert(4) root.insert(5) root.insert(6) root.insert(8) value, parent = root.search(10) print(value.data, parent.data) root.print_tree() print('_'_ 20) root.delete(4) root.print_tree() + +``` +把自己理解的部分写了写。当做练习,就先当个α版吧。 +`2016-05-28` + + +基本搞明白了 +完整代码[在这里](https://github.com/lambdaplus/python/blob/master/binary_tree.py) + +### 广度遍历和深度遍历二叉树! + +```python +def lookup(root): + stack = [root] + while stack: + current = stack.pop() + print(current.data) + if current.left: + stack.append(current.left) + if current.right: + stack.append(current.right) + + +def deep(root): + if not root: + return + deep(root.left) + deep(root.right) + print(root.data) +``` +### 求最大树深 + +```python +# -*- coding:utf-8 -*- +class TreeNode: + def __init__(self, x): + self.val = x + self.left = None + self.right = None + +class Solution: + def TreeDepth(self, pRoot): + if not pRoot: + return 0 + return max(self.TreeDepth(pRoot.left), self.TreeDepth(pRoot.right)) + 1 +``` + +### 比较两棵树是否相同 + +```python +def is_same(t1, t2): + if t1 == None and t2 == None: + return True + elif t1 and t2: + return t1.data == t2.data and is_same(t1.left, t2.left)\ + and is_same(t1.right, t2.right) + else: + return False +``` + +### 已知前序中序求后序 + +前面说到: +前序: root -> left -> right +中序: left -> root -> right +后序: left -> right -> root + +前序: 第一个值 A 即为根节点 +中序: A 的左边全为左子树,右边全是右子树 + +```python +def pre_in_post(pre_order, in_order): + if not pre_order: + return + post = Node(pre_order[0]) + index = in_order.index(pre_order[0]) + post.left = pre_in_post(pre_order[1:index+1], in_order[:index]) + post.right = pre_in_post(pre_order[index+1:], in_order[index+1:]) + return post +``` +### 已知前序中序构造出树 +```python +# -*- coding:utf-8 -*- +class TreeNode: + def __init__(self, x): + self.val = x + self.left = None + self.right = None + +class Solution: + # 返回构造的TreeNode根节点 + def reConstructBinaryTree(self, pre, tin): + # write code here + if not pre: + return + tree = TreeNode(pre[0]) + index = tin.index(pre[0]) + tree.left = self.reConstructBinaryTree(pre[1:index+1],tin[:index]) + tree.right = self.reConstructBinaryTree(pre[index+1:],tin[index+1:]) + return tree + + @classmethod + def print_tree(cls, tree): + if tree: + cls.print_tree(tree.left) + cls.print_tree(tree.right) + print(tree.val) + +pre = [1,2,3,4,5,6,7] +tin = [3,2,4,1,6,5,7] +s = Solution() +t = s.reConstructBinaryTree(pre, tin) +s.print_tree(t) +``` +### 树的子结构 + +```python +求pRoot2 的子树是否为 pRoot2 +# -*- coding:utf-8 -*- +# class TreeNode: +# def __init__(self, x): +# self.val = x +# self.left = None +# self.right = None +class Solution: + def is_subtree(self, t1, t2): + if not t2: # t2 is None 其为子树 + return True + if not t1: + return False + if not t1.val == t2.val: + return False + return self.is_subtree(t1.left, t2.left) and self.is_subtree(t1.right, t2.right) + + def HasSubtree(self, pRoot1, pRoot2): + # write code here + result = False + if pRoot1 and pRoot2: + if pRoot1.val == pRoot2.val: + result = self.is_subtree(pRoot1, pRoot2) + if not result: + result = self.is_subtree(pRoot1.left, pRoot2) + if not result: + result = self.is_subtree(pRoot1.right, pRoot2) + return result 
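+
+# 用法示意(假设 t1、t2 是已构造好的 TreeNode 根节点,仅为演示补充):
+#   Solution().HasSubtree(t1, t2)  # 返回 t2 是否为 t1 的子结构
+# 按题目约定,空树不是任何树的子结构,所以入口处要求 pRoot1 和 pRoot2 均非空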
+``` +### 对称二叉树 + +``` +# -*- coding:utf-8 -*- +# class TreeNode: +# def __init__(self, x): +# self.val = x +# self.left = None +# self.right = None +class Solution: + + def isSymmetrical(self, pRoot): + def is_same(p1, p2): + if not (p1 or p2): + return True + elif p1 and p2 and p1.val == p2.val: + return is_same(p1.left, p2.right) and is_same(p1.right, p2.left) + return False + + if not pRoot: + return True + return is_same(pRoot.left, pRoot.right) +``` +### 二叉树镜像 + +``` +# -*- coding:utf-8 -*- +# class TreeNode: +# def __init__(self, x): +# self.val = x +# self.left = None +# self.right = None +class Solution: + # 返回镜像树的根节点 + def Mirror(self, root): + # write code here + if not root: + return None + elif not (root.left or root.right): + return root + + root.left, root.right = root.right, root.left + if root.left: + self.Mirror(root.left) + if root.right: + self.Mirror(root.right) +``` diff --git a/binary_tree.py b/binary_tree.py new file mode 100644 index 0000000..876a35b --- /dev/null +++ b/binary_tree.py @@ -0,0 +1,128 @@ +class Node: + + def __init__(self, data): + self.left = None + self.right = None + self.data = data + + def insert(self, data): + if data < self.data: # 树叉小于节点 + if self.left is None: # 并且左面的树叉为空 + self.left = Node(data) # 当仁不让的插入 + else: # 非空的话 + self.left.insert(data) # 以左树叉为节点继续插入 + + elif data > self.data: + if self.right is None: + self.right = Node(data) + else: + self.right.insert(data) + else: + self.data = data + + def search(self, data, parent=None): + ''' + data为目标查询值,同时返回parent(父节点)便于定位。 + ''' + if data < self.data: + if self.left is None: + return None, None + else: + return self.left.search(data, self) + + elif data > self.data: + if self.right is None: + return None, None + + return self.right.search(data, self) + else: + # return self.data, parent.data + return self, parent + + def print_tree_in(self): # 中序 + if self.left: + self.left.print_tree_in() + print(self.data) + if self.right: + self.right.print_tree_in() + + def print_tree_pre(self): # 前序 + print(self.data) + if self.left: + self.left.print_tree_pre() + if self.right: + self.right.print_tree_pre() + + def print_tree_post(self): # 后序 + if self.left: + self.left.print_tree_post() + if self.right: + self.right.print_tree_post() + print(self.data) + + def chrildren(self): + count = 0 + if self.left: + count += 1 + + if self.right: + count += 1 + + return count + + def delete(self, data): + node, parent = self.search(data) + chrildren = node.chrildren() # 子节点数目 + if chrildren == 0: # 情况 1, 没有子节点,直接删除即可 + if parent.left is node: # 判断目标节点是其父节点的 左or右 节点 + parent.left = None + else: + parent.right = None + del node + + elif chrildren == 1: # 情况 2, 有一个子节点,用子节点替换其即可 + if node.left: + tmp = node.left + else: + tmp = node.right + if parent: + if parent.left is node: + parent.left = tmp + else: + parent.right = tmp + del node + else: + ''' + 第三种情况比较复杂: + 1. 左节点0个子节点 + 2. 左节点1个子节点 + 3. 
左节点2个子节点 + ''' + parent = node + successor = node.right + while successor.left: # 递归思想,直至找到最左的子节点, 保持树的平衡,用右子节点的值替换 + parent = successor + successor = successor.left + node.data = successor.data + if parent.left == successor: + parent.left = successor.right + else: + parent.right = successor.right + +# 接下来可以测试以下种的树怎么样啦。 +root = Node(11) +root.insert(14) +root.insert(9) +root.insert(9) +root.insert(7) +root.insert(10) +root.insert(4) +root.insert(5) +root.insert(6) +root.insert(8) +value, parent = root.search(10) +print(value.data, parent.data) +root.print_tree_in() +print('*' * 20) +root.delete(4) +root.print_tree_in() diff --git a/category_all.py b/category_all.py new file mode 100644 index 0000000..ce03fd4 --- /dev/null +++ b/category_all.py @@ -0,0 +1,62 @@ +# coding: utf-8 +import re +import concurrent.futures +import requests +from bs4 import BeautifulSoup as bs +from pymongo import MongoClient + + +def fetch(url): + res = requests.get(url) + res.encoding = 'gbk' + content = bs(res.text, 'lxml') + return content + + +def base_info(html): + pattern = re.compile(r'http://blog.sina.com.cn/s/blog_.*\.html') + links = re.findall(pattern, str(html)) + date_ = re.findall(r'\((\d{2,}.*)\)', str(html)) + tle_auth = html.select('li') + authes = (auth.text.split(' ')[0] for auth in tle_auth) + titles = (title.text.split(' ')[-1] for title in tle_auth) + for infos in zip(links, titles, authes, date_): + yield infos + + +def save(url): + html = fetch(url) + data = base_info(html) + client = MongoClient('localhost', 27017) + db = client.infos + coll = db.coll + for num, d in enumerate(data, 1): + datum = { + 'links': d[0], + 'title': d[1], + 'auther': d[2], + 'date': d[3] + } + + count = coll.find({'links': d[0]}).count() + if count == 0: + coll.insert_one(datum) + print('{} is grabbed'.format(url)) + + +if __name__ == '__main__': + url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml' + + start = int(input('请输入开始页数, 默认为1 >> ')) + if not start: + start = 1 + + end = int(input('输入结束页数, 默认为100 >> ')) + if not end: + end = 100 + + pages = range(start, end + 1) + urls = [url.format(page) for page in pages] + + with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor: + executor.map(save, urls) diff --git a/category_test.py b/category_test.py new file mode 100644 index 0000000..af8a95e --- /dev/null +++ b/category_test.py @@ -0,0 +1,72 @@ +# coding=utf-8 + +import sys +reload(sys) +sys.setdefaultencoding('utf-8') + +import re +import csv +import requests +import jieba +import jieba.analyse +from bs4 import BeautifulSoup as bs +from tgrocery import Grocery +from train_txt_5 import train_src + + +def artical_content(url): + rsp = requests.get(url) + rsp.encoding = 'utf-8' + html = bs(rsp.text, 'lxml') + # artical = html.select('#sina_keyword_ad_area2') + artical = html.select('.h1_tit') + if len(artical) > 0: + content = artical[0].text + else: + content = html.select('.SG_txta')[0].text + # content = '' + return content.strip() + + +def category(urls): + for url in urls: + artical = artical_content(url) + cate2 = new_grocery.predict(artical) + yield cate2.predicted_y + + +def unzip(seq, L=None): + if L is None: + L = [] + for s in seq: + if not isinstance(s, (list, )): + L.append(s) + else: + L.extend(unzip(s)) + return L + + +if __name__ == "__main__": + + grocery = Grocery('sample') + grocery.train(train_src) + grocery.save() + new_grocery = Grocery('sample') + new_grocery.load() + + L1 = [] + with open('/home/mouse/Downloads/female.csv', 'r') as f1: + f1_csv = csv.reader(f1) + 
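+        # 逐行读取 CSV,把第一列的博客 URL 收集到 L1(假设第一列就是链接)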
for row in f1_csv: + L1.append(row[0]) + # print(len(L1)) + + cate = category(L1) + i = 1 + with open('/home/mouse/infoss.csv', 'w') as f: + f_csv = csv.writer(f) + for row in zip(L1, cate): + f_csv.writerow(unzip(row)) + print 'Writing now, please waiting...{}'.format(str(i)) + i += 1 + print 'Done' diff --git a/celeries/proj/__init__.py b/celeries/proj/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/celeries/proj/celery.py b/celeries/proj/celery.py new file mode 100644 index 0000000..acd9617 --- /dev/null +++ b/celeries/proj/celery.py @@ -0,0 +1,10 @@ +# coding=utf-8 +from __future__ import absolute_import +from celery import Celery + +app = Celery('proj', include=['proj.tasks']) +app.config_from_object('proj.celeryconfig') + + +if __name__ == '__main__': + app.start() diff --git a/celeries/proj/celeryconfig.py b/celeries/proj/celeryconfig.py new file mode 100644 index 0000000..864e2f0 --- /dev/null +++ b/celeries/proj/celeryconfig.py @@ -0,0 +1,7 @@ +# coding=utf-8 +BROKER_URL = 'amqp://localhost' # RabbitMQ 作为消息代理 +CELERY_RESULT_BACKEND = 'redis://localhost' # Redis 作为结果存储 +CELERY_TASK_SERIALIZER = 'msgpack' +# 任务序列化和反序列化格式为 msgpack, 别忘了安装 msgpack-python +CELERY_RESULT_SERIALIZER = 'json' # 结果存储序列化格式为 json +CELERY_ACCEPT_CONTENT = ['msgpack', 'json'] # 任务接受格式类型 diff --git a/celeries/proj/tasks.py b/celeries/proj/tasks.py new file mode 100644 index 0000000..16d3ccb --- /dev/null +++ b/celeries/proj/tasks.py @@ -0,0 +1,35 @@ +# coding=utf-8 +from __future__ import absolute_import + +from .celery import app + +from celery.utils.log import get_task_logger + +logger = get_task_logger(__name__) + + +@app.task +def add(x, y): + return x + y + + +@app.task +def mul(x, y): + return x * y + + +@app.task(bind=True) +def div(self, x, y): + logger.info( + ''' + Executing task : {0.id} + task.args : {0.args!r} + task.kwargs : {0.kwargs!r} + '''.format(self.request) + ) + try: + res = x / y + except ZeroDivisionError as e: + raise self.retry(exc=e, countdown=3, max_retries=3) + else: + return res diff --git a/celeries/projb/__init__.py b/celeries/projb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/celeries/projb/celery.py b/celeries/projb/celery.py new file mode 100644 index 0000000..fb02e54 --- /dev/null +++ b/celeries/projb/celery.py @@ -0,0 +1,10 @@ +# coding=utf-8 +from __future__ import absolute_import +from celery import Celery + +app = Celery('projb', include=['projb.tasks']) +app.config_from_object('projb.celeryconfig') + + +if __name__ == '__main__': + app.start() diff --git a/celeries/projb/celeryconfig.py b/celeries/projb/celeryconfig.py new file mode 100644 index 0000000..d661cc5 --- /dev/null +++ b/celeries/projb/celeryconfig.py @@ -0,0 +1,28 @@ +# coding=utf-8 +from kombu import Queue + +BROKER_URL = 'amqp://localhost' # RabbitMQ 作为消息代理 +CELERY_RESULT_BACKEND = 'redis://localhost:6379/0' # Redis 作为结果存储 +CELERY_TASK_SERIALIZER = 'msgpack' +# 任务序列化和反序列化格式为 msgpack, 别忘了安装 msgpack-python +CELERY_RESULT_SERIALIZER = 'json' # 结果存储序列化格式为 json +CELERY_ACCEPT_CONTENT = ['msgpack', 'json'] # 任务接受格式类型 + +CELERY_QUEUES = { + Queue('foo', routing_key='task.#'), + Queue('feed_task', routing_key='*.feed'), +} +CELERY_DEFAULT_QUEUE = 'foo' + +CELERY_DEFAULT_EXCHANGE = 'tasks' + +CELERY_DEFAULT_EXCHANGE_TYPE = 'topic' + +CELERY_DEFAULT_ROUTING_KEY = 'task.foooooo' + +CELERY_ROUTES = { + 'projb.tasks.mul': { + 'queue': 'feed_task', + 'routing_key': 'mul.feed', + }, +} diff --git a/celeries/projb/tasks.py b/celeries/projb/tasks.py new file mode 100644 index 
0000000..9b7d2a7 --- /dev/null +++ b/celeries/projb/tasks.py @@ -0,0 +1,14 @@ +# coding=utf-8 +from __future__ import absolute_import + +from .celery import app + + +@app.task +def add(x, y): + return x + y + + +@app.task +def mul(x, y): + return x * y diff --git a/celeries/projc/__init__.py b/celeries/projc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/celeries/projc/celery.py b/celeries/projc/celery.py new file mode 100644 index 0000000..125ef95 --- /dev/null +++ b/celeries/projc/celery.py @@ -0,0 +1,10 @@ +# coding=utf-8 +from __future__ import absolute_import +from celery import Celery + +app = Celery('projc', include=['projc.tasks']) +app.config_from_object('projc.celeryconfig') + + +if __name__ == '__main__': + app.start() diff --git a/celeries/projc/celeryconfig.py b/celeries/projc/celeryconfig.py new file mode 100644 index 0000000..1cf48ff --- /dev/null +++ b/celeries/projc/celeryconfig.py @@ -0,0 +1,37 @@ +# coding=utf-8 +from kombu import Queue + +BROKER_URL = 'amqp://localhost' # RabbitMQ 作为消息代理 +CELERY_RESULT_BACKEND = 'redis://localhost:6379/0' # Redis 作为结果存储 +CELERY_TASK_SERIALIZER = 'msgpack' +# 任务序列化和反序列化格式为 msgpack, 别忘了安装 msgpack-python +CELERY_RESULT_SERIALIZER = 'json' # 结果存储序列化格式为 json +CELERY_ACCEPT_CONTENT = ['msgpack', 'json'] # 任务接受格式类型 + +CELERY_QUEUES = { + Queue('foo', routing_key='task.#'), + Queue('feed_task', routing_key='*.feed'), +} +CELERY_DEFAULT_QUEUE = 'foo' + +CELERY_DEFAULT_EXCHANGE = 'tasks' + +CELERY_DEFAULT_EXCHANGE_TYPE = 'topic' + +CELERY_DEFAULT_ROUTING_KEY = 'task.foooooo' + +CELERY_ROUTES = { + 'projb.tasks.mul': { + 'queue': 'feed_task', + 'routing_key': 'mul.feed', + }, + +} + +CELERYBEAT_SCHEDULE = { + 'mul-every-30-seconds': { + 'task': 'projc.tasks.mul', + 'schedule': 30.0, + 'args': (2, 2), + } +} diff --git a/celeries/projc/tasks.py b/celeries/projc/tasks.py new file mode 100644 index 0000000..9b7d2a7 --- /dev/null +++ b/celeries/projc/tasks.py @@ -0,0 +1,14 @@ +# coding=utf-8 +from __future__ import absolute_import + +from .celery import app + + +@app.task +def add(x, y): + return x + y + + +@app.task +def mul(x, y): + return x * y diff --git a/celery_learn.md b/celery_learn.md new file mode 100644 index 0000000..f8c8b84 --- /dev/null +++ b/celery_learn.md @@ -0,0 +1,309 @@ +# Celery 的简单使用 + +标签: python celery + +--- +***代码[在这里](https://github.com/lambdaplus/python/tree/master/celeries)*** + + +Celery 是一个简单、灵活并且可靠的处理大量消息的分发系统。并且是自带电池的,本身提供了维护和操作这个系统的工具。 + +Celery 专注于实时处理的任务队列,并且支持任务调度。 +优点: +1. 简单 +2. 高可用 +3. 快速 +4. 
灵活

## Celery 架构

+ Celery Beat: 任务调度器
+ Celery Worker: 消费者
+ Broker: 消息中间件,常用的是 RabbitMQ 和 Redis
+ Producer:任务生产者
+ Result Backend:用于结果保存。

## Celery 序列化

(序列化格式通过下文 celeryconfig.py 里的 CELERY_TASK_SERIALIZER、CELERY_RESULT_SERIALIZER 和 CELERY_ACCEPT_CONTENT 三项配置。)

## 一个简单的例子
项目目录为
```bash
celeries/proj/
├── celeryconfig.py
├── celery.py
├── __init__.py
└── tasks.py
```
---
主程序 celery.py
```python
from __future__ import absolute_import
from celery import Celery

app = Celery('proj', include=['proj.tasks'])
app.config_from_object('proj.celeryconfig')


if __name__ == '__main__':
    app.start()
```


任务函数 tasks.py
```python
# coding=utf-8
from __future__ import absolute_import

from .celery import app


@app.task
def add(x, y):
    return x + y


@app.task
def mul(x, y):
    return x * y
```
接下来是配置文件 celeryconfig.py
```python
# coding=utf-8
BROKER_URL = 'amqp://localhost'  # RabbitMQ 作为消息代理
CELERY_RESULT_BACKEND = 'redis://localhost'  # Redis 作为结果存储
CELERY_TASK_SERIALIZER = 'msgpack'
# 任务序列化和反序列化格式为 msgpack, 别忘了安装 msgpack-python
CELERY_RESULT_SERIALIZER = 'json'  # 结果存储序列化格式为 json
CELERY_ACCEPT_CONTENT = ['msgpack', 'json']  # 任务接受格式类型
```
因为没有任务调度,所以直接启动消费者就行了。在启动之前,要先去安装 RabbitMQ 和 Redis,并启动。

现在启动我们的消费者(worker),命令行直接启动:

    > cd celeries
    > celery -A celeries worker -l info

看到下面的提示信息,表示成功启动
```python
 -------------- celery@mouse-pc v4.0.2 (latentcall)
---- **** -----
--- * *** * -- Linux-4.9.15-1-MANJARO-x86_64-with-glibc2.2.5 2017-03-22 21:53:05
-- * - **** ---
- ** ---------- [config]
- ** ---------- .> app:         celeries:0x7f9737da7a58
- ** ---------- .> transport:   amqp://guest:**@localhost:5672//
- ** ---------- .> results:     redis://localhost/
- *** --- * --- .> concurrency: 2 (prefork)
-- ******* ---- .> task events: OFF (enable -E to monitor tasks in this worker)
--- ***** -----
 -------------- [queues]
                .> celery           exchange=celery(direct) key=celery


[tasks]
  . celeries.tasks.add
  . celeries.tasks.mul
  . celeries.tasks.xsum

[2017-03-22 21:53:06,011: INFO/MainProcess] Connected to amqp://guest:**@127.0.0.1:5672//
[2017-03-22 21:53:06,034: INFO/MainProcess] mingle: searching for neighbors
[2017-03-22 21:53:07,088: INFO/MainProcess] mingle: all alone
[2017-03-22 21:53:07,115: INFO/MainProcess] celery@mouse-pc ready.
```
打开 IPython 测试一下我们的几个函数。
```python
~ ▶︎︎ ipython
Python 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00)
Type "copyright", "credits" or "license" for more information.


In [1]: from celeries.tasks import add, mul, xsum

In [2]: add.delay(1, 9)
Out[2]: <AsyncResult: 38022eec-2d3d-4ee0-8c7e-367ef92b5f1f>

In [3]: r = mul.delay(2, 4)

In [4]: r.status
Out[4]: 'SUCCESS'

In [5]: r.result
Out[5]: 8

In [6]: r.successful   # 注意:没加括号,拿到的是方法本身而不是调用结果
Out[6]: <bound method AsyncResult.successful of <AsyncResult: 17af4e48-736d-44c9-a8be-a50a35bbc435>>

In [7]: r.backend
Out[7]: <celery.backends.redis.RedisBackend at 0x...>  # 结果存储在 redis 里

```
delay() 是 apply_async() 的快捷方式。你也可以直接调用 apply_async():
```python
In [24]: r = mul.apply_async((2, 4))

In [25]: r.result
Out[25]: 8
```
delay() 和 apply_async() 返回的都是 AsyncResult 实例,可用于查看任务的执行状态,但首先你要配置好 result backend.
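AsyncResult 还可以用来做简单的轮询等待。下面是个演示性的小片段(假设 worker 和 result backend 已按上文启动、能 import 到 proj.tasks;这不是项目里的代码):

```python
# coding=utf-8
import time

from proj.tasks import add

r = add.apply_async((1, 9), countdown=2)  # countdown=2: 延迟 2 秒再执行
while not r.ready():       # ready(): 任务是否已执行完毕(成功或失败)
    time.sleep(0.5)
print(r.status, r.result)  # 成功时输出: SUCCESS 10
```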
+此时,在worker终端上可以看到,任务信息和结果 +```bash +[2017-03-22 22:05:13,689: INFO/MainProcess] Received task: celeries.tasks.add[38022eec-2d3d-4ee0-8c7e-367ef92b5f1f] +[2017-03-22 22:05:14,765: INFO/PoolWorker-2] Task celeries.tasks.add[38022eec-2d3d-4ee0-8c7e-367ef92b5f1f] succeeded in 0.007736653999018017s: 10 +[2017-03-22 22:08:36,378: INFO/MainProcess] Received task: celeries.tasks.mul[17af4e48-736d-44c9-a8be-a50a35bbc435] +[2017-03-22 22:08:37,010: INFO/PoolWorker-2] Task celeries.tasks.mul[17af4e48-736d-44c9-a8be-a50a35bbc435] succeeded in 0.011531784999533556s: 8 +``` +仔细看,每个任务都有一个 task_id。我们可以通过 task_id 获得任务的结果。 + +取 add 任务的 id: +```bash +task_id = '38022eec-2d3d-4ee0-8c7e-367ef92b5f1f' +In [8]: task_id = '38022eec-2d3d-4ee0-8c7e-367ef92b5f1f' + +In [9]: add.AsyncResult(task_id).get() +Out[9]: 10 +``` +关联任务 + + In [2]: m = mul.apply_async((2, 2), link=mul.s(3)) + +在 Worker 终端里会看到两个值,关联之前和之后的。 +``` +[2017-03-23 13:27:13,045: INFO/MainProcess] Received task: proj.tasks.mul[40492357-44bb-41e4-979f-6eb197107a5b] +[2017-03-23 13:27:13,731: INFO/PoolWorker-2] Task proj.tasks.mul[40492357-44bb-41e4-979f-6eb197107a5b] succeeded in 0.0023383530005958164s: 4 +[2017-03-23 13:27:13,732: INFO/MainProcess] Received task: proj.tasks.mul[b01be1b8-f957-48b2-9d72-8187af6ac161] +[2017-03-23 13:27:13,734: INFO/PoolWorker-2] Task proj.tasks.mul[b01be1b8-f957-48b2-9d72-8187af6ac161] succeeded in 0.0006868359996587969s: 12 +``` + + +## 指定队列 +在 celeries 目录下新建一个目录 projb, 代码使用 proj 中的。 +```bash +celeries/projb +├── celeryconfig.py +├── celery.py +├── __init__.py +└── tasks.py +``` +在 celeryconfig.py 添加些配置: +``` +# coding=utf-8 +from kombu import Queue + +BROKER_URL = 'amqp://localhost' # RabbitMQ 作为消息代理 +CELERY_RESULT_BACKEND = 'redis://localhost:6379/0' # Redis 作为结果存储 +CELERY_TASK_SERIALIZER = 'msgpack' +# 任务序列化和反序列化格式为 msgpack, 别忘了安装 msgpack-python +CELERY_RESULT_SERIALIZER = 'json' # 结果存储序列化格式为 json +CELERY_ACCEPT_CONTENT = ['msgpack', 'json'] # 任务接受格式类型 + +CELERY_QUEUES = { + Queue('foo', routing_key='task.#'), # 路由键以 task. 开头的消息进入此队列 + Queue('feed_task', routing_key='*.feed'), # 路由键以 .feed 结尾的消息进入此队列 +} +CELERY_DEFAULT_QUEUE = 'foo' # 默认队列 + +CELERY_DEFAULT_EXCHANGE = 'tasks' # 默认交换机 + +CELERY_DEFAULT_EXCHANGE_TYPE = 'topic' # 默认交换机类型 topic + +CELERY_DEFAULT_ROUTING_KEY = 'task.foooooooo' # 默认交换机路由键, task. 
后的值不影响 + +CELERY_ROUTES = { + 'projb.tasks.mul': { + 'queue': 'feed_task', # 消息全都进入 feed_task 队列 + 'routing_key': 'mul.feed', + }, +} +``` +然后,我们以指定队列的方式启动: + + > celery -A projb worker -Q foo,feed_task -l info + +tasks.py 中的 mul 函数只会通过队列 feed_task 被执行。add 函数通过默认队列 foo 执行。 + ```python +In [84]: from projb.tasks import mul, add + +In [85]: r = add.delay(3, 3) + +In [86]: r.result +Out[86]: 6 + +In [87]: res = mul.delay(3, 3) + +In [88]: res.result +Out[88]: 9 +``` +不过,我们可以使用 apply_async() 函数来指定队列。 +```python +In [90]: r = add.apply_async((3, 3), queue='feed_task', routing_key='mul.feed') + +In [91]: r.result +Out[91]: 6 + +In [92]: res = mul.apply_async((3, 3), queue='foo', routing_key='task.foooooo') + +In [93]: res.result +Out[93]: 9 +``` + +## 任务调度 +依法炮制,基于 projb 的代码,创建目录 projc,在 proc/celeryconfig.py 中添加如下配置。 +``` +CELERYBEAT_SCHEDULE = { + 'mul-every-30-seconds': { + 'task': 'projc.tasks.mul', + 'schedule': 30.0, + 'args': (2, 2), + } +} +``` +执行 + + > celery -B -A projc worker -l info + +就可以在终端看到每 30s 执行一次任务。 +``` +[2017-03-23 12:23:13,920: INFO/Beat] Scheduler: Sending due task mul-every-30-seconds (projc.tasks.mul) +[2017-03-23 12:23:13,923: INFO/MainProcess] Received task: projc.tasks.mul[9c414257-d627-4c36-a9d8-9daed7e295c0] +[2017-03-23 12:23:15,177: INFO/PoolWorker-3] Task projc.tasks.mul[9c414257-d627-4c36-a9d8-9daed7e295c0] succeeded in 0.0010301589991286164s: 4 +``` + +## 任务绑定、日志记录和错误重试 + +任务绑定、记录日志和重试是 Celery 3 个常用的高级功能。接下来,修改 proj 的 tasks.py 文件。添加一个 div 函数。 +``` +@app.task(bind=True) +def div(self, x, y): + logger.info( + ''' + Executing task : {0.id} + task.args : {0.args!r} + task.kwargs : {0.kwargs!r} + '''.format(self.request) + ) + try: + res = x / y + except ZeroDivisionError as e: + raise self.retry(exc=e, countdown=3, max_retries=3) + else: + return res +``` +在 Ipython 调用: + + In [3]: d = div.delay(2, 1) + +在 worker 中可以看到 +``` +[2017-03-23 14:57:17,361: INFO/PoolWorker-2] proj.tasks.div[68ef1584-16ac-4236-9858-b00842891bbc]: + Executing task : 68ef1584-16ac-4236-9858-b00842891bbc + task.args : [2, 1] + task.kwargs : {} + +[2017-03-23 14:57:17,369: INFO/PoolWorker-2] Task proj.tasks.div[68ef1584-16ac-4236-9858-b00842891bbc] succeeded in 0.007741746998362942s: 2.0 +``` +换成可以引起异常的参数: + + In [4]: d = div.delay(2, 0) + +可以看到,在 worker 中每 3s 重试一次,总共重复三次(执行了 4 次),然后抛出异常! 
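顺带一提,如果随后用 get() 取结果,重试耗尽后的异常默认会原样抛给调用方(propagate 默认为 True)。示意如下(沿用上面的 d):

```python
In [5]: d.get(timeout=20)  # 重试全部失败后,这里会抛出 ZeroDivisionError
```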
\ No newline at end of file
diff --git a/decorate/decorater_of_class.py b/decorate/decorater_of_class.py
new file mode 100644
index 0000000..3ab6999
--- /dev/null
+++ b/decorate/decorater_of_class.py
@@ -0,0 +1,19 @@
+# coding=utf-8
+class Log():
+
+    def __init__(self, file="info.log"):
+        self.file = file
+
+    def __call__(self, func):
+        def wrapper(*args, **kw):
+            log = func.__name__ + " was called"
+            print(log)
+            with open(self.file, 'a') as f:
+                f.write(log + '\n')
+            return func(*args, **kw)
+        return wrapper  # 返回 wrapper,被装饰的 hello 才仍然可调用,且每次调用都记日志
+
+
+@Log()
+def hello():
+    print('Hello World!')
diff --git a/decorate/decorater_with_para.py b/decorate/decorater_with_para.py
new file mode 100644
index 0000000..59878d1
--- /dev/null
+++ b/decorate/decorater_with_para.py
@@ -0,0 +1,30 @@
+# coding=utf-8
+
+from functools import wraps
+
+
+def logs(file="info.log"):
+    def decorate(func):
+        @wraps(func)
+        def wrapper(*args, **kw):
+            log = func.__name__ + " was called"
+            print(log)
+            with open(file, 'a') as f:
+                f.write(log + '\n')
+            return func(*args, **kw)  # 记完日志别忘了执行被装饰的函数本身
+        return wrapper
+    return decorate
+
+
+@logs()
+def hello():
+    print('Hello World!')
+
+hello()
+
+
+@logs(file='info2.log')
+def hello2():
+    print('Hello World!')
+
+hello2()
diff --git a/decorate/decorater_without_para.py b/decorate/decorater_without_para.py
new file mode 100644
index 0000000..f8c286a
--- /dev/null
+++ b/decorate/decorater_without_para.py
@@ -0,0 +1,19 @@
+# coding=utf-8
+
+from functools import wraps
+
+
+def log(func):
+    @wraps(func)
+    def wrapper(*args, **kw):
+        print("I'm a log ^*^")
+        result = func(*args, **kw)
+        return result
+    return wrapper
+
+
+@log
+def hello():
+    print('Hello everybody')
+
+hello()
diff --git a/host_trans_anti.py b/host_trans_anti.py
new file mode 100644
index 0000000..9e937f2
--- /dev/null
+++ b/host_trans_anti.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+
+# 把 anti-AD 的 dnsmasq 规则转换成 MerlinClash 的 hosts 格式
+
+import wget
+from datetime import date
+today = date.today()
+
+def host_trans(url):
+    file = wget.download(url, out='/home/lambda/Documents/adblock/anti_ad_'+str(today)+'.conf')
+    with open(file, 'r') as f:
+        new_f = open('/home/lambda/Documents/adblock/anti_ad_'+str(today)+'.yaml', 'w')
+        new_f.write('hosts:\n')
+        new_f.write(' router.asus.com: 192.168.50.1\n')
+        new_f.write(' services.googleapis.cn: 74.125.193.94\n')
+        for lines in f:
+            if '#' not in lines and not lines == "\n":
+                # 获取网址
+                tail = lines[9:].strip()
+                # tail[:-1] 去掉字符串尾部的 /
+                new_lines = " " + tail[:-1] + ": " + '127.0.0.1'
+                new_f.write(new_lines+'\n')
+            else:
+                pass
+        new_f.close()
+    print("anti-ad.yaml文件保存在文档/adblock文件夹下")
+
+if __name__ == '__main__':
+    url = 'https://anti-ad.net/anti-ad-for-dnsmasq.conf'
+    host_trans(url)
\ No newline at end of file
diff --git "a/hosts\350\275\254\346\215\242\345\244\247\345\234\243\345\207\200\345\214\226.py" "b/hosts\350\275\254\346\215\242\345\244\247\345\234\243\345\207\200\345\214\226.py"
new file mode 100644
index 0000000..6a69229
--- /dev/null
+++ "b/hosts\350\275\254\346\215\242\345\244\247\345\234\243\345\207\200\345\214\226.py"
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[32]:
+
+
+with open('/home/lambda/Downloads/host.html', 'r') as f:
+    new_f = open('/home/lambda/Downloads/ad.yaml', 'w')
+    new_f.write('hosts:\n')
+    new_f.write(' router.asus.com: 192.168.50.1\n')
+    new_f.write(' services.googleapis.cn: 74.125.193.94\n')
+    for lines in f:
+        if '#' not in lines and not lines == "\n":
+            tail = lines[9:].strip()
+            head = lines[:9].strip()
+            new_lines = " " + tail + ": " + head
+            new_f.write(new_lines+'\n')
+        else:
+            pass
+    new_f.close()
+
+
+# In[ ]:
+
+
+
+
diff --git 
"a/hosts\350\275\254\346\215\242\345\244\247\345\234\243\345\207\200\345\214\226\345\222\214anti-AD.ipynb" "b/hosts\350\275\254\346\215\242\345\244\247\345\234\243\345\207\200\345\214\226\345\222\214anti-AD.ipynb" new file mode 100644 index 0000000..c0b98b9 --- /dev/null +++ "b/hosts\350\275\254\346\215\242\345\244\247\345\234\243\345\207\200\345\214\226\345\222\214anti-AD.ipynb" @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 大圣净化的转换代码" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "with open('/home/lambda/Downloads/host.html', 'r') as f:\n", + " new_f = open('/home/lambda/Downloads/ad.yaml', 'w')\n", + " new_f.write('hosts:\\n')\n", + " new_f.write(' router.asus.com: 192.168.50.1\\n')\n", + " new_f.write(' services.googleapis.cn: 74.125.193.94\\')\n", + " for lines in f:\n", + " if '#' not in lines and not lines == \"\\n\":\n", + " tail = lines[9:].strip()\n", + " head = lines[:9].strip()\n", + " new_lines = \" \" + tail + \": \" + head\n", + " new_f.write(new_lines+'\\n')\n", + " else:\n", + " pass\n", + " new_f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## anti-AD的转换代码" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('/home/lambda/Downloads/adblock-for-dnsmasq.con.html', 'r') as f:\n", + " new_f = open('/home/lambda/Downloads/anti_ad.yaml', 'w')\n", + " new_f.write('hosts:\\n')\n", + " new_f.write(' router.asus.com: 192.168.50.1\\n')\n", + " new_f.write(' services.googleapis.cn: 74.125.193.94\\n')\n", + " for lines in f:\n", + " if '#' not in lines and not lines == \"\\n\":\n", + " # 获取网址\n", + " tail = lines[9:].strip()\n", + " # tail[:-1] 去掉字符串尾部的 /\n", + " new_lines = \" \" + tail[:-1] + \": \" + '127.0.0.1'\n", + " new_f.write(new_lines+'\\n')\n", + " else:\n", + " pass\n", + " new_f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 合体" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mf2\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mf3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mf2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mf1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mf3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m 
\u001b[0mf1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.8/codecs.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, input, final)\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;31m# decode input (taking the buffer into account)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 322\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconsumed\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 323\u001b[0m \u001b[0;31m# keep undecoded input until the next call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mconsumed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "with open('/home/lambda/Downloads/anti_dasheng.yaml', 'w') as f1, open('/home/lambda/Downloads/ad.yaml', 'r') as f2, open('/home/lambda/Downloads/anti_ad.yaml', 'r') as f3:\n", + " while f2 or f3:\n", + " if f2:\n", + " f1.write(f2.readline())\n", + " if f3:\n", + " f1.write(f3.readline())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/husheng_action.py b/husheng_action.py new file mode 100644 index 0000000..f57e5d1 --- /dev/null +++ b/husheng_action.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Apr 18 17:34:03 2021 + +@author: lambda +""" +import aiohttp +import asyncio +import re +import os +import sys +import json +import random +import pandas as pd +import datetime +import time +import requests as request +from lxml import etree + +user_agent = [ + "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", + "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", + "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; 
.NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", + "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", + "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", + "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", + "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", + "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", + "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", + "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", + "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", + "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", + "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", + "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", + "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", + "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", + "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", + "UCWEB7.0.2.37/28/999", + "NOKIA5700/ UCWEB7.0.2.37/28/999", + "Openwave/ UCWEB7.0.2.37/28/999", + "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", + # iPhone 6: + "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", +] + +start = time.time() + +host = 'http://data.eastmoney.com/hsgtcg/list.html?DateType=DateType=%27jd%27' +res = request.get(host) +xml = etree.HTML(res.text) +result = xml.xpath('/html/body/div[1]/div[8]/div[2]/div[2]/div[1]/div[1]/div/span/text()')[0] +today = result[1:11] 
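+# result 形如 '(2021-04-16 ……)',切片 [1:11] 跳过左括号取 10 个字符得到日期串(按该页面当时的格式推断)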
+print(f'今天获取的数据是: {today}') + +fname = str(today)+".xlsx" +fname1 = "PPOS_POTE_"+fname +fname2 = "PPOS_POTE_SZ_"+fname + +#file_path = './hushengang' +#if not os.path.exists(file_path): +# os.mkdir(file_path) + +#if os.path.exists(fname): +# print('文件已存在,明天再来吧') +# sys.exit() + +heads = {'HdDate', 'SCode', 'SName', 'NewPrice', 'ShareSZ_Chg_One', 'ShareSZ_Chg_Rate_One', 'LTZB_One', 'ZZB_One'} +rows = [] + +# 获取网页信息 +async def fetch(session, url): + headers = {'User-Agent': random.choice(user_agent)} + async with session.get(url, headers=headers) as response: + return await response.text(encoding='utf-8') + +# 解析网页 +async def parser(html): + pat = re.compile('data:(.*)}', re.S) # 使用正则 + result = re.search(pat, html).group(1) + data = json.loads(result) + if len(data) == 0: + print('日期有错误,看看是不是日期不对。。。。。。') + sys.exit() + for d in data: + row = {key: value for key, value in d.items() if key in heads} + rows.append(row) + +# 下载网页 +async def download(url): + async with aiohttp.ClientSession() as session: + html = await fetch(session, url) + await parser(html) + +#urls = [f'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?callback=jQuery112305322211230994847_1618827285261&st=ShareSZ_Chg_One&sr=-1&ps=50&p='+str(p)+'&type=HSGT20_GGTJ_SUM&token=894050c76af8597a853f5b408b759f5d&js=%7B%22data%22%3A(x)%2C%22pages%22%3A(tp)%2C%22font%22%3A(font)%7D&filter=(DateType%3D%27jd%27)(HdDate%3D%27'+str(today)+'%27)' for p in range(1, 31)] +urls = [f'http://dcfm.eastmoney.com/EM_MutiSvcExpandInterface/api/js/get?type=HSGT20_GGTJ_SUM&token=894050c76af8597a853f5b408b759f5d&st=ShareSZ_Chg_One&sr=-1&p='+str(p)+'&ps=50&js=var%20mXyeKPjW={pages:(tp),data:(x)}&filter=(DateType=%27jd%27%20and%20HdDate=%27'+str(today)+'%27)&rt=53931781' for p in range(1, 31)] + +# 利用asyncio模块进行异步IO处理 +async def main(): + await asyncio.gather(*[download(url) for url in urls]) + +asyncio.run(main()) +# 将rows转化为pandas中的DataFrame +df = pd.DataFrame(rows) +df.columns = ['日期', '代码', '名称', '最新股价' , '市值', '市值增幅', '占流通股比', '占总股比'] +# 从大到小排序 +df.sort_values(by='市值', ascending=False) +try: + df.to_excel(fname) # 保存成Excel文件 +except Exception as e: + print("请关闭文件后再试", e) + +df1 = df.nlargest(20, '占总股比') +df2 = df.nlargest(20, "占流通股比") +df3 = df.nlargest(10, "市值") +# 占总股比前20和流通股比前20的交集 +df1_df2 = pd.merge(df1, df2, on=list(df.columns), how='inner') +# 三者的交集 +df1_df2_df3 = pd.merge(df1_df2, df3, on=list(df.columns), how='inner') +try: + df1_df2.to_excel(fname1) +except Exception as e: + print("请关闭文件后再试", e) + +try: + df1_df2_df3.to_excel(fname2) +except Exception as e: + print("请关闭文件后再试", e) + +stop = time.time() +print(f"使用aiohttp共耗时{stop-start} S") diff --git a/hushengangtong.py b/hushengangtong.py new file mode 100644 index 0000000..9b841a5 --- /dev/null +++ b/hushengangtong.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Apr 18 17:34:03 2021 + +@author: lambda +""" +import aiohttp +import asyncio +import re +import os +import sys +import json +import random +import pandas as pd +import datetime +import time +import requests as request +from lxml import etree + +user_agent = [ + "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", + "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", + "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET 
CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", + "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", + "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", + "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", + "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", + "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", + "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", + "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", + "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", + "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", + "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", + "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", + "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", + "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", + "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", + "UCWEB7.0.2.37/28/999", + "NOKIA5700/ UCWEB7.0.2.37/28/999", + "Openwave/ UCWEB7.0.2.37/28/999", + "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", + # iPhone 6: + "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", +] + +start = time.time() + +host = 'http://data.eastmoney.com/hsgtcg/list.html?DateType=DateType=%27jd%27' +res = request.get(host) +xml = etree.HTML(res.text) +result = xml.xpath('/html/body/div[1]/div[8]/div[2]/div[2]/div[1]/div[1]/div/span/text()')[0] +today = result[1:11] +print(f'今天获取的数据是: {today}') + +fname = 
str(today)+".xlsx"
+fname1 = "PPOS_POTE_"+fname
+fname2 = "PPOS_POTE_SZ_"+fname
+
+file_path = './'
+#if not os.path.exists(file_path):
+#    os.mkdir(file_path)
+#os.chdir(file_path)
+
+if os.path.exists(fname):
+    print('文件已存在,明天再来吧')
+    sys.exit()
+
+heads = {'HdDate', 'SCode', 'SName', 'NewPrice', 'ShareSZ_Chg_One', 'ShareSZ_Chg_Rate_One', 'LTZB_One', 'ZZB_One'}
+rows = []
+
+# Fetch one page, sending a randomly chosen User-Agent
+async def fetch(session, url):
+    headers = {'User-Agent': random.choice(user_agent)}
+    async with session.get(url, headers=headers) as response:
+        return await response.text(encoding='utf-8')
+
+# Parse the JSONP-style response with a regex and keep only the whitelisted fields
+async def parser(html):
+    pat = re.compile('data:(.*)}', re.S)
+    result = re.search(pat, html).group(1)
+    data = json.loads(result)
+    if len(data) == 0:
+        print('日期有错误,看看是不是日期不对。。。。。。')
+        sys.exit()
+    for d in data:
+        row = {key: value for key, value in d.items() if key in heads}
+        rows.append(row)
+
+# Download one page and hand it to the parser
+async def download(url):
+    async with aiohttp.ClientSession() as session:
+        html = await fetch(session, url)
+        await parser(html)
+
+#urls = [f'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?callback=jQuery112305322211230994847_1618827285261&st=ShareSZ_Chg_One&sr=-1&ps=50&p='+str(p)+'&type=HSGT20_GGTJ_SUM&token=894050c76af8597a853f5b408b759f5d&js=%7B%22data%22%3A(x)%2C%22pages%22%3A(tp)%2C%22font%22%3A(font)%7D&filter=(DateType%3D%27jd%27)(HdDate%3D%27'+str(today)+'%27)' for p in range(1, 31)]
+urls = ['http://dcfm.eastmoney.com/EM_MutiSvcExpandInterface/api/js/get?type=HSGT20_GGTJ_SUM&token=894050c76af8597a853f5b408b759f5d&st=ShareSZ_Chg_One&sr=-1&p='+str(p)+'&ps=50&js=var%20mXyeKPjW={pages:(tp),data:(x)}&filter=(DateType=%27jd%27%20and%20HdDate=%27'+str(today)+'%27)&rt=53931781' for p in range(1, 31)]
+
+# Run all downloads concurrently on the asyncio event loop
+async def main():
+    await asyncio.gather(*[download(url) for url in urls])
+
+asyncio.run(main())
+# Turn the collected rows into a pandas DataFrame
+df = pd.DataFrame(rows)
+df.columns = ['日期', '代码', '名称', '最新股价', '市值', '市值增幅', '占流通股比', '占总股比']
+# Sort descending by market value; sort_values returns a new DataFrame, so reassign
+df = df.sort_values(by='市值', ascending=False)
+try:
+    df.to_excel(fname)  # save as an Excel file
+except Exception as e:
+    print("请关闭文件后再试", e)
+
+df1 = df.nlargest(20, '占总股比')
+df2 = df.nlargest(20, "占流通股比")
+df3 = df.nlargest(10, "市值")
+# Intersection of the top 20 by total-share ratio and the top 20 by float-share ratio
+df1_df2 = pd.merge(df1, df2, on=list(df.columns), how='inner')
+# Intersection of all three
+df1_df2_df3 = pd.merge(df1_df2, df3, on=list(df.columns), how='inner')
+try:
+    df1_df2.to_excel(fname1)
+except Exception as e:
+    print("请关闭文件后再试", e)
+
+try:
+    df1_df2_df3.to_excel(fname2)
+except Exception as e:
+    print("请关闭文件后再试", e)
+
+stop = time.time()
+print(f"使用aiohttp共耗时{stop-start} S")
diff --git a/rabbitmq/emit_logs.py b/rabbitmq/emit_logs.py
new file mode 100644
index 0000000..d0181bd
--- /dev/null
+++ b/rabbitmq/emit_logs.py
@@ -0,0 +1,17 @@
+# coding: utf-8
+import pika
+import sys
+
+connection = pika.BlockingConnection(
+    pika.ConnectionParameters(host='localhost'))
+channel = connection.channel()
+
+channel.exchange_declare(exchange='logs', type='fanout')  # older pika API; recent pika renamed this parameter to exchange_type
+messages = ' '.join(sys.argv[1:]) or 'info: Hello World!'
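+
+# With a fanout exchange the routing_key below is ignored: every queue
+# currently bound to 'logs' gets its own copy of the message.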
+channel.basic_publish(exchange='logs',
+                      routing_key='',
+                      body=messages)
+
+print("[x] Sent {}".format(messages))
+connection.close()
diff --git a/rabbitmq/emit_logs_direct.py b/rabbitmq/emit_logs_direct.py
new file mode 100644
index 0000000..ad65ed5
--- /dev/null
+++ b/rabbitmq/emit_logs_direct.py
@@ -0,0 +1,19 @@
+# coding=utf-8
+import pika
+import sys
+
+connection = pika.BlockingConnection(
+    pika.ConnectionParameters(host='localhost'))
+channel = connection.channel()
+
+channel.exchange_declare(exchange='direct_logs',
+                         type='direct')
+
+severity = sys.argv[1] if len(sys.argv) > 1 else 'info'
+messages = ' '.join(sys.argv[2:]) or "Hello World!"
+
+channel.basic_publish(exchange="direct_logs",
+                      routing_key=severity,
+                      body=messages)
+print('[x] Sent {}:{}'.format(severity, messages))
+connection.close()
diff --git a/rabbitmq/emit_logs_topic.py b/rabbitmq/emit_logs_topic.py
new file mode 100644
index 0000000..9cf1101
--- /dev/null
+++ b/rabbitmq/emit_logs_topic.py
@@ -0,0 +1,20 @@
+# coding=utf-8
+import pika
+import sys
+
+connection = pika.BlockingConnection(
+    pika.ConnectionParameters(host='localhost'))
+channel = connection.channel()
+
+channel.exchange_declare(exchange='topic_logs',
+                         type='topic')
+
+routing_key = sys.argv[1] if len(sys.argv) > 1 else 'anonymous.info'
+messages = ' '.join(sys.argv[2:]) or "Hello World!"
+
+channel.basic_publish(exchange='topic_logs',
+                      routing_key=routing_key,
+                      body=messages)
+
+print("[x] Sent {}:{}".format(routing_key, messages))
+connection.close()
diff --git a/rabbitmq/kombu_emit_logs_topic.py b/rabbitmq/kombu_emit_logs_topic.py
new file mode 100644
index 0000000..ef21777
--- /dev/null
+++ b/rabbitmq/kombu_emit_logs_topic.py
@@ -0,0 +1,17 @@
+# coding=utf-8
+import sys
+
+from kombu import Connection, Producer, Queue, Exchange
+
+logs_exchange = Exchange('logs', 'topic', durable=True)
+
+URL = 'amqp://localhost'
+
+routing_key = sys.argv[1] if len(sys.argv) > 1 else 'anonymous.info'
+messages = ' '.join(sys.argv[2:]) or "Hello World!"
+
+with Connection(URL) as conn:
+    producer = Producer(conn)
+    producer.publish(messages, exchange=logs_exchange,
+                     routing_key=routing_key,
+                     serializer='json')
diff --git a/rabbitmq/kombu_receive_logs_topic.py b/rabbitmq/kombu_receive_logs_topic.py
new file mode 100644
index 0000000..a276bea
--- /dev/null
+++ b/rabbitmq/kombu_receive_logs_topic.py
@@ -0,0 +1,41 @@
+# coding=utf-8
+import sys
+
+from kombu import Exchange, Queue, Connection, Consumer
+from kombu.async import Hub  # renamed to kombu.asynchronous in newer kombu releases
+
+
+logs_exchange = Exchange(name='logs', type="topic", durable=True)
+
+URL = 'amqp://localhost'
+hub = Hub()
+
+binding_keys = sys.argv[1:]
+if not binding_keys:
+    sys.stderr.write("Usage: {} [binding_keys]...\n".format(sys.argv[0]))
+    sys.exit()
+
+tasks_queues = [Queue(binding_key,
+                      logs_exchange,
+                      exclusive=True,
+                      routing_key=binding_key)
+                for binding_key in binding_keys]
+
+print("[*] Waiting for logs. 
To exit press Ctrl+C")
+
+
+def on_messages(body, messages):
+    print("""
+    Body: {0}
+    Properties: {1}
+    DeliveryInfo: {2}
+    """.format(body, messages.properties, messages.delivery_info)
+    )
+
+with Connection(URL) as conn:
+    conn.register_with_event_loop(hub)
+    with Consumer(conn, tasks_queues, callbacks=[on_messages]):
+        try:
+            hub.run_forever()
+        except KeyboardInterrupt:
+            exit()
diff --git a/rabbitmq/kombu_receive_logs_topic_2.py b/rabbitmq/kombu_receive_logs_topic_2.py
new file mode 100644
index 0000000..e033266
--- /dev/null
+++ b/rabbitmq/kombu_receive_logs_topic_2.py
@@ -0,0 +1,32 @@
+# coding=utf-8
+import sys
+
+from kombu import Exchange, Queue, Connection
+from kombu.mixins import ConsumerMixin
+
+
+class Worker(ConsumerMixin):
+    logs_exchange = Exchange(name='logs', type="topic", durable=True)
+
+    def __init__(self, connection):
+        self.connection = connection
+
+        # must be stored on self -- get_consumers() reads self.binding_keys
+        self.binding_keys = sys.argv[1:]
+        if not self.binding_keys:
+            sys.stderr.write('Usage: {} [binding_keys] ...\n'.format(sys.argv[0]))
+            sys.exit(1)
+
+    def get_consumers(self, Consumer, channel):
+        return [Consumer([Queue(binding_key,
+                                self.logs_exchange,
+                                exclusive=True,
+                                routing_key=binding_key)
+                          for binding_key in self.binding_keys],
+                         callbacks=[self.on_messages])]
+
+    def on_messages(self, body, messages):
+        print('Body: {}'.format(body))
+
+
+URL = 'amqp://localhost'
+with Connection(URL) as connection:
+    Worker(connection).run()
diff --git a/rabbitmq/new_task.py b/rabbitmq/new_task.py
new file mode 100644
index 0000000..d738cc9
--- /dev/null
+++ b/rabbitmq/new_task.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+import pika
+import sys
+
+connection = pika.BlockingConnection(
+    pika.ConnectionParameters(host='localhost'))
+channel = connection.channel()
+channel.queue_declare(queue='task_queue', durable=True)
+# durable=True lets the queue survive a RabbitMQ restart; delivery_mode=2
+# below additionally persists the messages themselves
+messages = ' '.join(sys.argv[1:]) or "Hello World!"
+channel.basic_publish(exchange='',
+                      routing_key='task_queue',
+                      body=messages,
+                      properties=pika.BasicProperties(
+                          delivery_mode=2))
+print('[x] Sent {}'.format(messages))
+connection.close()
diff --git a/rabbitmq/receive.py b/rabbitmq/receive.py
new file mode 100644
index 0000000..d512421
--- /dev/null
+++ b/rabbitmq/receive.py
@@ -0,0 +1,22 @@
+# coding=utf-8
+import pika
+
+connection = pika.BlockingConnection(pika.ConnectionParameters(
+    host="localhost"))
+channel = connection.channel()
+channel.queue_declare(queue='hello')
+
+
+def callback(ch, method, properties, body):
+    print("[x] Received {}".format(body))
+
+channel.basic_consume(callback,
+                      queue='hello',
+                      no_ack=True)  # explicitly opt out of message acknowledgements
+
+print('[*] Waiting for messages. To exit press Ctrl+C')
+
+try:
+    channel.start_consuming()
+except KeyboardInterrupt:
+    channel.stop_consuming()
diff --git a/rabbitmq/receive_logs.py b/rabbitmq/receive_logs.py
new file mode 100644
index 0000000..de1bf52
--- /dev/null
+++ b/rabbitmq/receive_logs.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+import pika
+
+connection = pika.BlockingConnection(
+    pika.ConnectionParameters(host='localhost'))
+channel = connection.channel()
+
+channel.exchange_declare(exchange='logs',
+                         type='fanout')
+
+result = channel.queue_declare(exclusive=True)
+# exclusive=True: the queue is deleted once this consumer disconnects
+
+queue_name = result.method.queue
+
+channel.queue_bind(exchange='logs',
+                   queue=queue_name)
+
+print("[*] Waiting for logs. 
To exit press Ctrl+C") + + +def callback(ch, method, properties, body): + print("[x] {}".format(body)) + +channel.basic_consume(callback, queue=queue_name, no_ack=True) + +try: + channel.start_consuming() +except KeyboardInterrupt: + channel.stop_consuming() diff --git a/rabbitmq/receive_logs_direct.py b/rabbitmq/receive_logs_direct.py new file mode 100644 index 0000000..3b89a15 --- /dev/null +++ b/rabbitmq/receive_logs_direct.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +import pika +import sys + +connection = pika.BlockingConnection( + pika.ConnectionParameters(host='localhost')) +channel = connection.channel() + +channel.exchange_declare(exchange='direct_logs', + type='direct') + +result = channel.queue_declare(exclusive=True) +queue_name = result.method.queue + +severities = sys.argv[1:] +if not severities: + sys.stderr.write( + "Usage: {} [info] [warning] [error]\n".format(sys.argv[0])) + sys.exit(1) + +for severity in severities: + channel.queue_bind(exchange='direct_logs', + queue=queue_name, + routing_key=severity) + +print("[*] Waitting for logs. To exit press Ctrl+C") + + +def callback(ch, method, properties, body): + print("[x] {}:{}".format(method.routing_key, body)) + +channel.basic_consume(callback, queue=queue_name, no_ack=True) +try: + channel.start_consuming() +except KeyboardInterrupt: + channel.stop_consuming() diff --git a/rabbitmq/receive_logs_topic.py b/rabbitmq/receive_logs_topic.py new file mode 100644 index 0000000..f246cd0 --- /dev/null +++ b/rabbitmq/receive_logs_topic.py @@ -0,0 +1,37 @@ +# coding=utf-8 +import pika +import sys + +connection = pika.BlockingConnection( + pika.ConnectionParameters(host='localhost')) +channel = connection.channel() + +channel.exchange_declare(exchange='topic_logs', + type='topic') + +result = channel.queue_declare(exclusive=True) +queue_name = result.method.queue + +binding_keys = sys.argv[1:] +if not binding_keys: + sys.stderr.write("Usage: {} [binding_keys]...\n".format(sys.argv[0])) + sys.exit() + +for binding_key in binding_keys: + channel.queue_bind(queue=queue_name, + exchange='topic_logs', + routing_key=binding_key) + +print("[*] Waitting for logs. 
To exit press Ctrl+C") + + +def callback(ch, method, properties, body): + print("[x] {}:{}".format(method.routing_key, body)) + +channel.basic_consume(callback, + queue=queue_name, + no_ack=True) +try: + channel.start_consuming() +except KeyboardInterrupt: + channel.stop_consuming() diff --git a/rabbitmq/rpc_client.py b/rabbitmq/rpc_client.py new file mode 100644 index 0000000..ca45e27 --- /dev/null +++ b/rabbitmq/rpc_client.py @@ -0,0 +1,42 @@ +# coding=utf-8 +import pika +import uuid + + +class FibonacciRpcClient(object): + + def __init__(self): + self.connection = pika.BlockingConnection( + pika.ConnectionParameters(host='localhost')) + self.channel = self.connection.channel() + + result = self.channel.queue_declare(exclusive=True) + self.callback_queue = result.method.queue + self.channel.basic_consume( + self.on_response, no_ack=True, queue=self.callback_queue) + + def on_response(self, ch, method, props, body): + if self.corr_id == props.correlation_id: + self.response = body + + def call(self, n): + self.response = None + self.corr_id = str(uuid.uuid4()) + self.channel.basic_publish(exchange='', + routing_key='rpc_queue', + properties=pika.BasicProperties + ( + reply_to=self.callback_queue, + correlation_id=self.corr_id + ), + body=str(n) + ) + + while self.response is None: + self.connection.process_data_events() + return int(self.response) + +fibonacci_rpc = FibonacciRpcClient() +print("[x] Requesting fib(40)") +response = fibonacci_rpc.call(40) +print("[.] Got {}".format(response)) diff --git a/rabbitmq/rpc_client_example.py b/rabbitmq/rpc_client_example.py new file mode 100644 index 0000000..a39fad1 --- /dev/null +++ b/rabbitmq/rpc_client_example.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +import pika +import uuid + + +class FibonacciRpcClient(object): + + def __init__(self): + self.connection = pika.BlockingConnection(pika.ConnectionParameters( + host='localhost')) + + self.channel = self.connection.channel() + + result = self.channel.queue_declare(exclusive=True) + self.callback_queue = result.method.queue + + self.channel.basic_consume(self.on_response, no_ack=True, + queue=self.callback_queue) + + def on_response(self, ch, method, props, body): + if self.corr_id == props.correlation_id: + self.response = body + + def call(self, n): + self.response = None + self.corr_id = str(uuid.uuid4()) + self.channel.basic_publish(exchange='', + routing_key='rpc_queue', + properties=pika.BasicProperties( + reply_to=self.callback_queue, + correlation_id=self.corr_id, + ), + body=str(n)) + while self.response is None: + self.connection.process_data_events() + return int(self.response) + +fibonacci_rpc = FibonacciRpcClient() + +print(" [x] Requesting fib(30)") +response = fibonacci_rpc.call(30) +print(" [.] Got %r" % response) diff --git a/rabbitmq/rpc_server.py b/rabbitmq/rpc_server.py new file mode 100644 index 0000000..a193d90 --- /dev/null +++ b/rabbitmq/rpc_server.py @@ -0,0 +1,37 @@ +# coding=utf-8 +import pika + +connection = pika.BlockingConnection( + pika.ConnectionParameters(host='localhost')) +channel = connection.channel() + +channel.queue_declare(queue='rpc_queue') + + +def fib(n): + if n == 0: + return 0 + elif n == 1: + return 1 + else: + return fib(n - 1) + fib(n - 2) + + +def on_request(ch, method, props, body): + n = int(body) + print("[.] 
fib({})".format(n)) + response = fib(n) + + ch.basic_publish(exchange='', + routing_key=props.reply_to, + properties=pika.BasicProperties( + correlation_id=props.correlation_id), + body=str(response)) + + ch.basic_ack(delivery_tag=method.delivery_tag) + +channel.basic_qos(prefetch_count=1) +channel.basic_consume(on_request, queue='rpc_queue') + +print("[x] Awaiting RPC requests") +channel.start_consuming() diff --git a/rabbitmq/rpc_server_example.py b/rabbitmq/rpc_server_example.py new file mode 100644 index 0000000..ffc6432 --- /dev/null +++ b/rabbitmq/rpc_server_example.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +import pika + +connection = pika.BlockingConnection(pika.ConnectionParameters( + host='localhost')) + +channel = connection.channel() + +channel.queue_declare(queue='rpc_queue') + + +def fib(n): + if n == 0: + return 0 + elif n == 1: + return 1 + else: + return fib(n - 1) + fib(n - 2) + + +def on_request(ch, method, props, body): + n = int(body) + + print(" [.] fib(%s)" % n) + response = fib(n) + + ch.basic_publish(exchange='', + routing_key=props.reply_to, + properties=pika.BasicProperties( + correlation_id=props.correlation_id), + body=str(response)) + ch.basic_ack(delivery_tag=method.delivery_tag) + +channel.basic_qos(prefetch_count=1) +channel.basic_consume(on_request, queue='rpc_queue') + +print(" [x] Awaiting RPC requests") +channel.start_consuming() diff --git a/rabbitmq/send.py b/rabbitmq/send.py new file mode 100644 index 0000000..f9af0cd --- /dev/null +++ b/rabbitmq/send.py @@ -0,0 +1,20 @@ +# coding=utf-8 +import sys +import pika + +connection = pika.BlockingConnection(pika.ConnectionParameters( + host='localhost')) +channel = connection.channel() +channel.queue_declare(queue='hello') # 声明 名为 hello 的 queue + +if len(sys.argv) != 1: + body = sys.argv[1] +else: + body = "Hello World!" + +channel.basic_publish(exchange='', # 默认交换机 + routing_key='hello', # queue 需要指定路由键 + body=body) + +print("[x] Sent {}.".format(body)) +connection.close() diff --git a/rabbitmq/worker.py b/rabbitmq/worker.py new file mode 100644 index 0000000..288e7b6 --- /dev/null +++ b/rabbitmq/worker.py @@ -0,0 +1,24 @@ +# coding=utf-8 +import pika +import time + +connection = pika.BlockingConnection( + pika.ConnectionParameters(host='localhost')) +channel = connection.channel() +channel.queue_declare(queue='task_queue', durable=True) +print("[*] Waitting for messages. 
To exit press Ctrl+C") + + +def callback(ch, method, properties, body): + print("[x] Received {}".format(body)) + time.sleep(body.count(b'.')) # 模拟耗时操作 + print("[x] Done") + ch.basic_ack(delivery_tag=method.delivery_tag) + +channel.basic_qos(prefetch_count=1) # 负载均衡 +channel.basic_consume(callback, queue='task_queue') + +try: + channel.start_consuming() +except KeyboardInterrupt: + channel.stop_consuming() diff --git a/test.py b/test.py new file mode 100644 index 0000000..c8a5b14 --- /dev/null +++ b/test.py @@ -0,0 +1,71 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import logging +import re +import aiohttp +import asyncio +from bs4 import BeautifulSoup +from pymongo import MongoClient + + +class DouBanCrawl(): + + def __init__(self, url): + self.url = url + + async def fetch(self, url, headers): + res = await aiohttp.request('GET', url) + body = res.read() + return (await body) + + def infos_get(self, html, name=None): + soup = BeautifulSoup(html, 'lxml') + scores = soup.select('.rating_num') + scores = [score.text for score in scores] + quotes = soup.select('p.quote > span') + quotes = [quote.text for quote in quotes] + pattern = r"https://movie.douban.com/subject/\w+/" + hrefs = re.findall(pattern, str(html))[::2] + title_list = soup.select('div.pic > a') + try: + titles = [re.findall(r'alt="(.*?)"', str(title))[0] + for title in title_list] + img_links = [re.findall(r'src="(.*?)"', str(src))[0] + for src in title_list] + except IndexError: + pass + return img_links, titles, scores, quotes, hrefs + + async def save_info(self, page): + url = self.url.format(page) + # print(url) + with await sem: + html = await self.fetch(url, headers) + img_links, titles, scores, quotes, hrefs = self.infos_get(html) + for infos in zip(img_links, titles, scores, quotes, hrefs): + info = {'img': infos[0], + 'name': infos[1], + 'score': infos[2], + 'quote': infos[3], + 'href': infos[4] + } + count = coll.find({"name": infos[1]}).count() + if count == 0: + coll.insert(info) + + +if __name__ == '__main__': + url = 'https://movie.douban.com/top250?start={}&filter=' + headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \ + (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'} + client = MongoClient('localhost', 27017) + db = client.movies + coll = db.coll + douban = DouBanCrawl(url) + pages = range(0, 250, 25) + sem = asyncio.Semaphore(4) # 限制协程并发量 + loop = asyncio.get_event_loop() + f = asyncio.wait([douban.save_info(page) for page in pages]) + loop.run_until_complete(f) # %time 为Ipython 自带功能模块 + print('Done') diff --git "a/\344\270\200\344\270\252\345\274\202\346\255\245\347\210\254\350\231\253.md" "b/\344\270\200\344\270\252\345\274\202\346\255\245\347\210\254\350\231\253.md" index 5508e4b..c4484b9 100644 --- "a/\344\270\200\344\270\252\345\274\202\346\255\245\347\210\254\350\231\253.md" +++ "b/\344\270\200\344\270\252\345\274\202\346\255\245\347\210\254\350\231\253.md" @@ -1,8 +1,5 @@ # 一个异步爬虫 -标签(空格分隔): python - ---- 看了好多天的异步,今天终于算是大致理解了。模仿着写了一个异步小爬虫。以前很不理解哪里要使用异步,搞的头大。对于爬虫来说,耗时的地方是对服务器的请求,于是把对网页的请求使用异步即可! 
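
A minimal sketch of the same idea with the current aiohttp API (not the code from this post): only the network round-trip is awaited, so the requests overlap instead of blocking one another.

```python
import asyncio
import aiohttp


async def fetch(session, url):
    # Only the network I/O is awaited; parsing stays ordinary blocking Python.
    async with session.get(url) as resp:
        return await resp.text()


async def main(urls):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))


pages = asyncio.run(main(['https://movie.douban.com/top250']))
```

The crawler below follows the same shape, written against the older aiohttp interface of the time.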
```python
diff --git "a/\345\205\250\351\235\242\346\224\276\345\274\200\344\272\214\345\255\251\346\224\277\347\255\226\350\203\214\346\231\257\344\270\213\350\202\262\351\276\204\345\261\205\346\260\221\347\224\237\350\202\262\346\204\217\345\220\221\350\260\203\346\237\245.pdf" "b/\345\205\250\351\235\242\346\224\276\345\274\200\344\272\214\345\255\251\346\224\277\347\255\226\350\203\214\346\231\257\344\270\213\350\202\262\351\276\204\345\261\205\346\260\221\347\224\237\350\202\262\346\204\217\345\220\221\350\260\203\346\237\245.pdf"
deleted file mode 100644
index 1989bbd..0000000
Binary files "a/\345\205\250\351\235\242\346\224\276\345\274\200\344\272\214\345\255\251\346\224\277\347\255\226\350\203\214\346\231\257\344\270\213\350\202\262\351\276\204\345\261\205\346\260\221\347\224\237\350\202\262\346\204\217\345\220\221\350\260\203\346\237\245.pdf" and /dev/null differ
diff --git "a/\346\226\260\346\265\252\345\215\232\345\256\242\346\226\207\346\234\254\350\201\232\347\261\273.md" "b/\346\226\260\346\265\252\345\215\232\345\256\242\346\226\207\346\234\254\350\201\232\347\261\273.md"
new file mode 100644
index 0000000..2d32cb3
--- /dev/null
+++ "b/\346\226\260\346\265\252\345\215\232\345\256\242\346\226\207\346\234\254\350\201\232\347\261\273.md"
@@ -0,0 +1,268 @@
+# 新浪博客文本聚类 (Sina blog text clustering)
+
+Tags (space separated): python
+
+---
+### Foreword
+This is a program I wrote before the New Year to help a QQ friend finish a thesis report. It still needs polishing.
+
+-----
+Below are the programs used for the report, all written in `Python`. Four programs were written, as needed:
+
+1. `article_base_info.py` scrapes an article's basic information: title, link, author and publication date
+2. `article_content_gevent.py` scrapes the article bodies
+3. `text_category.py` classifies the articles
+4. `format_data.py` formats the data
+
+The code follows.
+```python
+# coding: utf-8
+'''
+Program: article_base_info.py
+1. Scrapes the basic information (title, link, author, publication date) of
+   Sina blog articles for a given number of pages.
+2. Saves the data to MongoDB.
+'''
+import re
+import concurrent.futures
+import requests
+from bs4 import BeautifulSoup as bs
+from pymongo import MongoClient
+
+
+def fetch(url):
+    res = requests.get(url)
+    res.encoding = 'gbk'
+    content = bs(res.text, 'lxml')
+    return content
+
+
+def base_info(html):
+    pattern = re.compile(r'http://blog.sina.com.cn/s/blog_.*\.html')
+    links = re.findall(pattern, str(html))
+    date_ = re.findall(r'\((\d{2,}.*)\)', str(html))
+    tle_auth = html.select('li')
+    authes = (auth.text.split(' ')[0] for auth in tle_auth)
+    titles = (title.text.split(' ')[-1] for title in tle_auth)
+    for infos in zip(links, titles, authes, date_):
+        yield infos
+
+
+def save(url):
+    html = fetch(url)
+    data = base_info(html)
+    client = MongoClient('localhost', 27017)
+    db = client.infos
+    coll = db.coll
+    for num, d in enumerate(data, 1):
+        datum = {
+            'links': d[0],
+            'title': d[1],
+            'auther': d[2],
+            'date': d[3]
+        }
+
+        count = coll.find({'links': d[0]}).count()
+        if count == 0:
+            coll.insert_one(datum)
+    print('{} is grabbed'.format(url))
+
+
+if __name__ == '__main__':
+    url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml'
+
+    start = input('请输入开始页数, 默认为1 >> ')
+    start = int(start) if start else 1
+
+    end = input('输入结束页数, 默认为100 >> ')
+    end = int(end) if end else 100
+
+    pages = range(start, end + 1)
+    urls = [url.format(page) for page in pages]
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
+        executor.map(save, urls)
+
+```
+-----------
+```python
+# -*-coding: utf-8 -*-
+'''
+Program: article_content_gevent.py
+1. Scrapes the bodies of the Sina blog articles.
+2. The article links are read from *筛选后所有博客数据.csv*, which is generated
+   by article_base_info.py.
+3. 
Some articles have been deleted by their authors (or removed for other
+   reasons), so a link may still exist while the article itself is gone.
+'''
+
+import os
+import csv
+import logging
+import requests
+import gevent
+from bs4 import BeautifulSoup as bs
+
+
+def fetch(url):
+    res = requests.get(url)
+    res.encoding = 'utf-8'
+    content = bs(res.text, 'lxml')
+    if not content:
+        logging.warning('The blog has been deleted!')
+    return content
+
+
+def content_get(html):
+    try:
+        artical = html.select('#sina_keyword_ad_area2')[0].text.strip()
+    except IndexError as e:
+        print(e)
+        logging.warning('no article body on this page')
+        artical = ' '
+    return artical
+
+
+def links_get(filename, urls=None):
+    with open(filename, 'r') as csvfile:
+        logging.info('read the file {}'.format(filename))
+        reader = csv.reader(csvfile)
+        if urls is None:
+            urls = []
+        urls = [row[0] for row in reader]
+    return urls
+
+
+def download(url):
+    html = fetch(url)
+    artical = content_get(html)
+    with open('/home/mouse/Documents/artical/{}.txt'
+              .format(url[-12:-5]), 'w') as f:
+        f.write(artical)
+        logging.info('writing the {}'.format(url))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s %(message)s',
+                        level=logging.WARNING)
+    filename = '/home/mouse/我的坚果云/董姐的论文所需/筛选后所有博客数据.csv'
+    urls = links_get(filename)
+    if not os.path.isdir('/home/mouse/Documents/artical/'):
+        os.makedirs('/home/mouse/Documents/artical/')
+    threads = [gevent.spawn(download, url) for url in urls]
+    gevent.joinall(threads)
+
+```
+---
+```python
+# coding=utf-8
+'''
+Program: text_category.py
+1. Automatically classifies the articles scraped from the Sina blogs.
+2. The classifier comes from https://github.com/2shou/TextGrocery
+3. Workflow: read the scraped article titles -> classify them -> write the
+   classified titles back out
+'''
+import os
+import csv
+from tgrocery import Grocery
+from train_txt import train_src
+
+
+def category(title_lst, cates=None):  # classify the articles
+    if cates is None:
+        cates = []
+    for title in title_lst:
+        cate = new_grocery.predict(title)
+        cates.append(cate.predicted_y)
+    return cates
+
+
+def get_artical_title(filename, title_lst=None):  # read the titles
+    if title_lst is None:
+        title_lst = []
+
+    with open(filename, 'r') as f1:
+        f1_csv = csv.reader(f1)
+        title_lst = [row[1] for row in f1_csv]
+
+    return title_lst
+
+
+def write_cated_info(filename, new_filename):  # write out the classified rows
+    titles = get_artical_title(filename)
+    categ = category(titles)
+    with open(filename, 'r') as read_file:
+        reader = csv.reader(read_file)
+        for i, row in enumerate(reader):
+            row.append(categ[i])
+            with open(new_filename, 'a+') as write_file:
+                writer = csv.writer(write_file)
+                writer.writerow(row)
+
+            print 'writing the {} item'.format(i)
+    print 'Done....................'
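+# NB: unlike the other scripts in this post, text_category.py is Python 2
+# code -- the TextGrocery library it depends on only supported Python 2.7 at
+# the time, which is why it uses print statements rather than print().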
+
+
+if __name__ == "__main__":
+    # filename and new_filename are the input and output file paths.
+    # Point them at other data sets to classify different files, as long as
+    # the data follows the same format.
+    filename = '/home/mouse/我的坚果云/董姐的论文所需/female7.csv'
+    new_filename = '/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv'
+    if os.path.isfile(new_filename):
+        os.remove(new_filename)
+    grocery = Grocery('sample')
+    grocery.train(train_src)
+    grocery.save()
+    new_grocery = Grocery('sample')
+    new_grocery.load()
+    write_cated_info(filename, new_filename)
+```
+---
+```python
+# -*- coding: utf-8 -*-
+'''
+Program: format_data.py
+A helper program that formats *标题整理数据.xlsx*: the title data is converted
+to csv format.
+'''
+
+import csv
+from collections import namedtuple
+cate = ['社会冲突和问题', '毛泽东思想与政策', '政党与政府设置', '民主与法治', '民族和国际关系',
+        '媒体与言论自由', '资本主义与市场经济', '全球化和对外开放', '民生与福利',
+        '家庭冲突与伦理', '传统文化', '性与个人自由', '环境污染', '生态保护', ]
+Category = namedtuple(
+    'Category', 'social mao govm demcy nation media capi glob live home tran sex env eco')
+
+filename = '/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv'
+
+
+def train_text(filename, train_src=None):
+    if train_src is None:
+        train_src = []
+
+    def format_cate():
+        for emp in map(Category._make, csv.reader(open(filename, 'r'))):
+            social = (cate[0], emp.social)
+            mao = (cate[1], emp.mao)
+            govm = (cate[2], emp.govm)
+            demcy = (cate[3], emp.demcy)
+            nation = (cate[4], emp.nation)
+            media = (cate[5], emp.media)
+            capi = (cate[6], emp.capi)
+            glob = (cate[7], emp.glob)
+            live = (cate[8], emp.live)
+            home = (cate[9], emp.home)
+            tran = (cate[10], emp.tran)
+            sex = (cate[11], emp.sex)
+            env = (cate[12], emp.env)
+            eco = (cate[13], emp.eco)
+            yield social, mao, govm, demcy, nation, media, capi, glob, \
+                live, home, tran, sex, env, eco
+
+    for cat in format_cate():
+        train_src.extend(list(cat))
+
+    return train_src
+```
+All of the programs above were written by me and all ran successfully on my machine. They have not been tested on other machines or platforms, so given the various dependency and compatibility issues (and my own limited skill) I cannot promise they will run for everyone.
+
+
diff --git "a/\346\262\252\346\267\261\346\270\257\351\200\232\345\255\243\345\272\246\345\207\200\346\265\201\345\205\245.py" "b/\346\262\252\346\267\261\346\270\257\351\200\232\345\255\243\345\272\246\345\207\200\346\265\201\345\205\245.py"
new file mode 100644
index 0000000..4977df4
--- /dev/null
+++ "b/\346\262\252\346\267\261\346\270\257\351\200\232\345\255\243\345\272\246\345\207\200\346\265\201\345\205\245.py"
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import re
+import os
+import json
+import csv
+import sys
+import pandas as pd
+import datetime
+import time
+import requests as request
+
+# Whitelist of the JSON fields to keep (named to match the later scripts)
+heads = {'HdDate', 'SCode', 'SName', 'NewPrice', 'ShareSZ_Chg_One', 'ShareSZ_Chg_Rate_One', 'LTZB_One', 'ZZB_One'}
+
+today = datetime.date.today()
+fname = str(today)+".xlsx"
+fname1 = "PPOS_POTE_"+fname
+fname2 = "PPOS_POTE_SZ_"+fname
+
+file_path = 'C:\\eastmoney'
+if not os.path.exists(file_path):
+    os.mkdir(file_path)
+os.chdir(file_path)
+
+def get_html(page):
+    # NOTE: HdDate is hard-coded to 2021-02-10 even though the output file is
+    # named after today's date; update it (or build it from `today`) when
+    # fetching a different quarter.
+    url = 'http://dcfm.eastmoney.com/EM_MutiSvcExpandInterface/api/js/get?type=HSGT20_GGTJ_SUM&token=894050c76af8597a853f5b408b759f5d&st=ShareSZ_Chg_One&sr=-1&p='+str(page)+'&ps=50&js=var%20Hyeikcqr={pages:(tp),data:(x)}&filter=(DateType=%27jd%27%20and%20HdDate=%272021-02-10%27)&rt=53772857'
+    res = request.get(url).text
+    pat = re.compile('data:(.*)}', re.S)
+    result = re.search(pat, res).group(1)
+    data = json.loads(result)
+    return data
+
+def get_one_page_stock(page):
+    rows = []
+    data = get_html(page)
+
+    for a in data:
+        row = {key: value for key, value in a.items() if key in heads}
+        rows.append(row)
+    return rows
+
+def get_all_stock():
+    all_rows = []
+    for page in range(1, 31):
+        print('\n正在下载第 %s 页表格' % page)
+        rows 
= get_one_page_stock(page) + all_rows.extend(rows) + print("下载已完成。。。。。") + return all_rows + +def get_pd(): + + all_rows = get_all_stock() + df = pd.DataFrame(all_rows) + df.columns = ['日期', '代码', '名称', '最新股价' , '市值', '市值增幅', '占流通股比', '占总股比'] + try: + df.to_excel(r'C:\eastmoney\%s' % fname) + except Exception as e: + print("请关闭文件后再试", e) + return df + +def good(f): + df1 = f.nlargest(20, '占总股比') + df2 = f.nlargest(20, "占流通股比") + df3 = f.nlargest(10, "市值") + df1_df2 = pd.merge(df1, df2, on=list(f.columns), how='inner') + df1_df2_df3 = pd.merge(df1_df2, df3, on=list(f.columns), how='inner') + try: + df1_df2.to_excel(r'C:\eastmoney\%s' % fname1) + except Exception as e: + print("请关闭文件后再试", e) + + try: + df1_df2_df3.to_excel(r'C:\eastmoney\%s' % fname2) + except Exception as e: + print("请关闭文件后再试", e) + +def main(): + start_time = time.time() + if os.path.exists(fname): + df = pd.read_excel(fname) + good(df) + else: + df = get_pd() + good(df) + + end_time = time.time() - start_time + print('文件保存在C盘eastmoney文件夹下') + print('程序耗时:{:.1f} s'.format(end_time)) + +main() + diff --git "a/\350\261\206\347\223\243\347\224\265\345\275\261Top250 \347\210\254\350\231\253.md" "b/\350\261\206\347\223\243\347\224\265\345\275\261Top250 \347\210\254\350\231\253.md" index e3e2fcd..aafb1d8 100644 --- "a/\350\261\206\347\223\243\347\224\265\345\275\261Top250 \347\210\254\350\231\253.md" +++ "b/\350\261\206\347\223\243\347\224\265\345\275\261Top250 \347\210\254\350\231\253.md" @@ -1,12 +1,95 @@ -# 豆瓣电影Top250 爬虫 +### 爬取豆瓣电影top250。 +--- +2016-11-04 更新 + +使用 mongoDB 存储 +--- +**本次更新** +抓取电影的如下简单信息 -标签(空格分隔): python +- 电影名 +- 封面 +- 评分 +- 评价人数 +- quote +- 链接 --- +```python +# coding=utf-8 + +import logging +import re +import aiohttp +import asyncio +from bs4 import BeautifulSoup +from pymongo import MongoClient + + +class DouBanCrawl(): + + def __init__(self, url): + self.url = url + + async def fetch(self, url, headers): + res = await aiohttp.request('GET', url) + body = res.read() + return (await body) + + def infos_get(self, html, name=None): + soup = BeautifulSoup(html, 'lxml') + scores = soup.select('.rating_num') + scores = [score.text for score in scores] + quotes = soup.select('p.quote > span') + quotes = [quote.text for quote in quotes] + pattern = r"https://movie.douban.com/subject/\w+/" + hrefs = re.findall(pattern, str(html))[::2] + title_list = soup.select('div.pic > a') + try: + titles = [re.findall(r'alt="(.*?)"', str(title))[0] + for title in title_list] + img_links = [re.findall(r'src="(.*?)"', str(src))[0] + for src in title_list] + except IndexError: + pass + return img_links, titles, scores, quotes, hrefs -## 爬取豆瓣电影top250。 + async def save_info(self, page): + url = self.url.format(page) + # print(url) + with await sem: + html = await self.fetch(url, headers) + img_links, titles, scores, quotes, hrefs = self.infos_get(html) + for infos in zip(img_links, titles, scores, quotes, hrefs): + info = {'img': infos[0], + 'name': infos[1], + 'score': infos[2], + 'quote': infos[3], + 'href': infos[4] + } + count = coll.find({"name": infos[1]}).count() + if count == 0: + coll.insert(info) -## 1. 
单线程版 + +if __name__ == '__main__': + url = 'https://movie.douban.com/top250?start={}&filter=' + headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \ + (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'} + client = MongoClient('localhost', 27017) + db = client.movies + coll = db.coll + douban = DouBanCrawl(url) + pages = range(0, 250, 25) + sem = asyncio.Semaphore(4) # 限制协程并发量 + loop = asyncio.get_event_loop() + f = asyncio.wait([douban.save_info(page) for page in pages]) + loop.run_until_complete(f) # %time 为Ipython 自带功能模块 + print('Done') +``` + +**以下为以前内容** +#### 1. 单线程版 ```python # -*- coding: utf-8 -*- @@ -52,7 +135,7 @@ Out: CPU times: user 1.11 s, sys: 8 ms, total: 1.12 s Wall time: 3.58 s ``` -## 2. 多线程版 +#### 2. 多线程版 ```python # -*- coding: utf-8 -*- @@ -99,7 +182,7 @@ if __name__ == '__main__': Out: CPU times: user 1.16 s, sys: 172 ms, total: 1.33 s Wall time: 1.28 s ``` -### 使用线程池 +#### 使用线程池 线程的创建和销毁是一个比较重的开销。所以,使用线程池,重用线程池中的线程! ```python @@ -115,7 +198,7 @@ Out: CPU times: user 1.23 s, sys: 152 ms, total: 1.38 s Wall time: 1.29 s ``` 再加上一个异步的吧 -## 3. 异步版 +#### 3. 异步版 此版本使用的是异步库`asyncio`和对其进行深度封装的库`aiohttp`。 ```python # coding=utf-8 @@ -159,10 +242,60 @@ if __name__ == '__main__': Out: CPU times: user 984 ms, sys: 28 ms, total: 1.01 s Wall time: 1.67 s ``` +#### 4. 使用下 Gevent 看看效果如何。 +```python +# coding=utf-8 + +import re +import requests +import gevent +from gevent.pool import Pool +from bs4 import BeautifulSoup as bs + + +def fetch(url): + s = requests.Session() + s.headers.update({"user-agent": user_agent}) + return s.get(url) + + +def title_get(url): + try: + result = fetch(url) + except requests.exceptions.RequestException: + return False + html = bs(result.text, 'lxml') + title_list = html.select('div.pic > a > img') + ''' + title_list中的元素格式如下 e.g: +