临街小站

CSDN访问量作弊器

前言

前些天在逛论坛的时候突然发现了一篇文章,标题是通过编程自动化提高自己博客访问量的。我想了下,突然感觉可以用学过的Python的简单知识来实现这一目的。主要原理就是BeautifulSoup+urllib的组合,通过BS解析网页,获取目录,然后深入,获取文章的url,通过urllib.request模块尝试连接CSDN的服务器。说干就干

脚本实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from bs4 import BeautifulSoup

import urllib.request
import urllib.parse
import sys
import time


#运行过程中的日志函数
def LOG(*argv):
sys.stderr.write(*argv)
sys.stderr.write('\n')

class Grab():
url = ''
soup = None

#读取当前网页的源代码数据返回
def GetPage(self, url):
self.url = url
LOG('input url is: %s' % self.url)
req = urllib.request.Request(url, headers={'User-Agent' : "Magic Browser"})
try:
page = urllib.request.urlopen(req)
except:
return
tem = page.read()
if not tem:
print('GetPage failed!')
sys.exit()
return tem

#获取目录页面下的文章url集合
def ExtractInfo(self,buf):
dom=etree.HTML(buf)
links=dom.xpath('//h3[@class="list_c_t"]/a/@href')
titles=dom.xpath('//h3[@class="list_c_t"]/a/text()')
for i in range(0,len(links)):links[i]='http://blog.csdn.net'+links[i]
for i in range(0,len(titles)):titles[i]=titles[i].strip()
return links,titles

#获取所有文章的目录页面url集合
def GetPageUrl(self,buf):
pages = set()
self.soup = BeautifulSoup(buf,'html.parser')
pageInfo=self.soup.find(attrs={'id':'papelist'})
#如果当前文章数量只有一页
if not pageInfo:
return None
pagelinks = pageInfo.findAll('a')
for link in pagelinks:
pages.add('http://blog.csdn.net/'+link['href'])
return pages

#获取当前访问文章的访问数、文章标题
def GetCurViewerPoint(self,buf):
self.soup = BeautifulSoup(buf,'html.parser')
pointobj = (self.soup.find(attrs={'class':'read_r'})).label.span.string
title = (self.soup.find(attrs={'class':'list_c_t'})).get_text()
pointobj=pointobj[2:len(pointobj)-1]
return title+' 当前阅读数:'+pointobj

grab = Grab()

#buf是当前页面经过转换之后的网页源代码
buf = grab.GetPage('http://blog.csdn.net/peihaozhu')

#pages中存放的是目录页面url集合
pages = ['http://blog.csdn.net/peihaozhu',]

#先从入口进入,如果文章数量不够,文章的目录页面只有一页
tem = grab.GetPageUrl(buf)
if not tem:
pass
else:
pages+=tem

#articles中存放所有的文章url集合
articles=set()

for page in pages:
buf = grab.GetPage(page)
links = grab.ExtractInfo(buf)
for url in links:
articles.add('http://blog.csdn.net/'+url)

#通过url.request访问文章
for url in articles:
for i in range(1,11):
buf=grab.GetPage(url)
print('第'+str(i)+'次访问 '+grab.GetCurViewerPoint(buf))
#每次访问之后停歇300ms
time.sleep(0.3)

UI实现

用PyQt5将程序的大致控件摆放完成了:

将pyqt生成的ui文件直接通过命令生成.py文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
pyuic5.bat -o layout.py untitled.ui

#代码如下
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'untitled.ui'
#
# Created by: PyQt5 UI code generator 5.5.1
#
# WARNING! All changes made in this file will be lost!

from PyQt5 import QtCore, QtGui, QtWidgets

class Ui_Form(object):
def setupUi(self, Form):
Form.setObjectName("Form")
self.label = QtWidgets.QLabel(Form)
self.label.setGeometry(QtCore.QRect(20, 30, 81, 21))
self.label.setObjectName("label")
self.username = QtWidgets.QPlainTextEdit(Form)
self.username.setGeometry(QtCore.QRect(110, 20, 341, 41))
self.username.setObjectName("username")
self.label_2 = QtWidgets.QLabel(Form)
self.label_2.setGeometry(QtCore.QRect(30, 80, 61, 31))
self.label_2.setObjectName("label_2")
self.times = QtWidgets.QPlainTextEdit(Form)
self.times.setGeometry(QtCore.QRect(110, 80, 151, 41))
self.times.setObjectName("times")
self.beginBtn = QtWidgets.QPushButton(Form)
self.beginBtn.setGeometry(QtCore.QRect(300, 80, 61, 41))
self.beginBtn.setObjectName("beginBtn")
self.progressBar = QtWidgets.QProgressBar(Form)
self.progressBar.setGeometry(QtCore.QRect(30, 350, 461, 41))
self.progressBar.setProperty("value", 24)
self.progressBar.setObjectName("progressBar")
self.listView = QtWidgets.QListView(Form)
self.listView.setGeometry(QtCore.QRect(30, 180, 431, 151))
self.listView.setObjectName("listView")
self.info = QtWidgets.QLabel(Form)
self.info.setGeometry(QtCore.QRect(30, 140, 421, 31))
self.info.setText("")
self.info.setObjectName("info")
self.exitBtn = QtWidgets.QPushButton(Form)
self.exitBtn.setGeometry(QtCore.QRect(390, 80, 61, 41))
self.exitBtn.setObjectName("exitBtn")

self.retranslateUi(Form)
self.exitBtn.clicked.connect(Form.close)
QtCore.QMetaObject.connectSlotsByName(Form)

def retranslateUi(self, Form):
_translate = QtCore.QCoreApplication.translate
Form.setWindowTitle(_translate("Form", "Form"))
self.label.setText(_translate("Form", " CSDN用户名"))
self.label_2.setText(_translate("Form", "设置次数"))
self.beginBtn.setText(_translate("Form", "Start"))
self.exitBtn.setText(_translate("Form", "Exit"))

UI、逻辑处理

遇到的一些问题

在这次编写GUI的过程中,我遇到了原来没有的问题。

以往的时候,如上篇文章,通过Python的QR模块生成QR二维码,因为逻辑非常简单,只是单纯的将所需要转换的数据变换成为相应的0、1二进制码,然后放到图片中的相应位置上,所以不会花费太多的时间,逻辑部分与界面部分就直接写在了一起没有问题。

这次刚开始的时候,我也没注意,直接就还是写在一块,由于牵扯到了url网络连接部分,所以不可避免的出现了阻塞现象。几乎在所有的GUI设计中,如果当长时间出现阻塞、无状态回应情况,都会出现界面的未响应状态,所以我想到了在Android开发中相当常规的子线程与UI线程通信,Handler的使用,在PyQt中也有类似的机制,也就是Qt的核心机制,信号槽机制,更多的内容可以看我另外的文章,我会详细的介绍下。

下面是我修改完成后的代码,可以顺利完成我预设的功能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import urllib.request
import urllib.parse
import time
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtCore import pyqtSignal
from bs4 import BeautifulSoup

#抓取网页的类
class Grab():
url = ''
soup = None

#读取当前网页的源代码数据返回
def GetPage(self, url):
self.url = url
LOG('input url is: %s' % self.url)
req = urllib.request.Request(url, headers={'User-Agent' : "Magic Browser"})
try:
page = urllib.request.urlopen(req)
except:
return
tem = page.read()
if not tem:
print('GetPage failed!')
sys.exit()
return tem

#获取目录页面下的文章url集合
def ExtractInfo(self,buf):
dom=etree.HTML(buf)
links=dom.xpath('//h3[@class="list_c_t"]/a/@href')
titles=dom.xpath('//h3[@class="list_c_t"]/a/text()')
for i in range(0,len(links)):links[i]='http://blog.csdn.net'+links[i]
for i in range(0,len(titles)):titles[i]=titles[i].strip()
return links,titles

#获取所有文章的目录页面url集合
def GetPageUrl(self,buf):
pages = set()
self.soup = BeautifulSoup(buf,'html.parser')
pageInfo=self.soup.find(attrs={'id':'papelist'})
#如果当前文章数量只有一页
if not pageInfo:
return None
pagelinks = pageInfo.findAll('a')
for link in pagelinks:
pages.add('http://blog.csdn.net/'+link['href'])
return pages

#获取当前访问文章的访问数、文章标题
def GetCurViewerPoint(self,buf):
self.soup = BeautifulSoup(buf,'html.parser')
pointobj = (self.soup.find(attrs={'class':'read_r'})).label.span.string
title = (self.soup.find(attrs={'class':'list_c_t'})).get_text()
pointobj=pointobj[2:len(pointobj)-1]
return title+' 当前阅读数:'+pointobj



#界面类
class Ui_Form(object):
def setupUi(self, Form):
Form.setObjectName("Form")
self.label = QtWidgets.QLabel(Form)
self.label.setGeometry(QtCore.QRect(20, 30, 81, 21))
self.label.setObjectName("label")
self.username = QtWidgets.QPlainTextEdit(Form)
self.username.setGeometry(QtCore.QRect(110, 20, 341, 41))
self.username.setObjectName("username")
self.label_2 = QtWidgets.QLabel(Form)
self.label_2.setGeometry(QtCore.QRect(30, 80, 61, 31))
self.label_2.setObjectName("label_2")
self.times = QtWidgets.QPlainTextEdit(Form)
self.times.setGeometry(QtCore.QRect(110, 80, 151, 41))
self.times.setObjectName("times")
self.beginBtn = QtWidgets.QPushButton(Form)
self.beginBtn.setGeometry(QtCore.QRect(300, 80, 61, 41))
self.beginBtn.setObjectName("beginBtn")
self.progressBar = QtWidgets.QProgressBar(Form)
self.progressBar.setGeometry(QtCore.QRect(30, 350, 461, 41))
self.progressBar.setProperty("value", 0)
self.progressBar.setObjectName("progressBar")
self.listWidget = QtWidgets.QListWidget(Form)
self.listWidget.setGeometry(QtCore.QRect(30, 180, 431, 151))
self.listWidget.setObjectName("listWidget")
self.info = QtWidgets.QLabel(Form)
self.info.setGeometry(QtCore.QRect(30, 140, 421, 31))
self.info.setText("")
self.info.setObjectName("info")
self.exitBtn = QtWidgets.QPushButton(Form)
self.exitBtn.setGeometry(QtCore.QRect(390, 80, 61, 41))
self.exitBtn.setObjectName("exitBtn")

self.thread=MyThread()
self.thread.sinOut.connect(self.handler)

self.retranslateUi(Form)
self.exitBtn.clicked.connect(Form.close)
self.beginBtn.pressed.connect(self.mainFunc)
QtCore.QMetaObject.connectSlotsByName(Form)

def handler(self,type,text,content):
if type == 1:
self.listWidget.addItems(content)
elif type == 2:
self.progressBar.setProperty("value", float(text))
elif type == 3:
self.info.setText(text)


def mainFunc(self):
username = self.username.toPlainText().strip()
times = self.times.toPlainText().strip()
if username and times:
self.thread.setVal(username,times)
self.thread.start()

def retranslateUi(self, Form):
_translate = QtCore.QCoreApplication.translate
Form.setWindowTitle(_translate("Form", "Blog作弊器"))
self.label.setText(_translate("Form", " CSDN用户名"))
self.label_2.setText(_translate("Form", "设置次数"))
self.beginBtn.setText(_translate("Form", "Start"))
self.exitBtn.setText(_translate("Form", "Exit"))


#子线程
class MyThread(QtCore.QThread):
sinOut = pyqtSignal(int,str,set)
articles = set()
def __init__(self):
super(MyThread,self).__init__()
self.username=''
self.times=''

def setVal(self,username,times):
self.username=username
self.times=times

def run(self):
#发射信号
grab = Grab()
buf = grab.GetPage('http://blog.csdn.net/'+self.username)
pages = ['http://blog.csdn.net/'+self.username,]
tem = grab.GetPageUrl(buf)
content = set()
links = []
titles = []
if not tem:pass
else: pages+=tem
for page in pages:
buf = grab.GetPage(page)
link,title = grab.ExtractInfo(buf)
links+=link
titles+=title
titles=zip(links,titles)
for link in links:
self.articles.add(link)
for title in titles:
tem = ''
for val in title:
tem+=val+' '
content.add(tem)

self.sinOut.emit(1,'',content)
sumRes = len(self.articles)*int(self.times)
cur = 1
for url in self.articles:
for i in range(0,int(self.times)):
buf=grab.GetPage(url)
self.sinOut.emit(2,str(cur/sumRes*100),content)
self.sinOut.emit(3,grab.GetCurViewerPoint(buf),content)
cur+=1
time.sleep(0.1)


if __name__=='__main__':
import sys
app=QtWidgets.QApplication(sys.argv)
widget=QtWidgets.QWidget()
ui=Ui_Form()
ui.setupUi(widget)
widget.show()
sys.exit(app.exec_())

总结

由于使用了designer默认的绝对布局方式,代码比较杂乱。总的来说也就分3个模块:

  1. 网页获取、解析工作类 Grab

  2. 界面布局、实时数据展现类 Ui_Form

  3. 逻辑控制、监控与沟通类 MyThread

各个模块相互合作,实现功能.

通过Requests、Xpath改写

这是后来补上的.

在我完成这篇文章之前的部分的时候,还是对python刚入门,使用了比较经典的一些网络模块。后来通过一些实例练习,接触到了一些简洁、优雅的模块,通过这些模块继续完善。

Grab类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
class Grab():
url = ''
soup = None
#读取当前网页的源代码数据返回
#读取当前网页的源代码数据返回
def GetPage(self, url):
self.url = url
LOG('input url is: %s' % self.url)
req = urllib.request.Request(url, headers={'User-Agent' : "Magic Browser"})
try:
page = urllib.request.urlopen(req)
except:
return
tem = page.read()
if not tem:
print('GetPage failed!')
sys.exit()
return tem

#获取目录页面下的文章url集合
def ExtractInfo(self,buf):
dom=etree.HTML(buf)
links=dom.xpath('//h3[@class="list_c_t"]/a/@href')
titles=dom.xpath('//h3[@class="list_c_t"]/a/text()')
for i in range(0,len(links)):links[i]='http://blog.csdn.net'+links[i]
for i in range(0,len(titles)):titles[i]=titles[i].strip()
return links,titles

#获取所有文章的目录页面url集合
def GetPageUrl(self,buf):
pages = set()
self.soup = BeautifulSoup(buf,'html.parser')
pageInfo=self.soup.find(attrs={'id':'papelist'})
#如果当前文章数量只有一页
if not pageInfo:
return None
pagelinks = pageInfo.findAll('a')
for link in pagelinks:
pages.add('http://blog.csdn.net/'+link['href'])
return pages

#获取当前访问文章的访问数、文章标题
def GetCurViewerPoint(self,buf):
self.soup = BeautifulSoup(buf,'html.parser')
pointobj = (self.soup.find(attrs={'class':'read_r'})).label.span.string
title = (self.soup.find(attrs={'class':'list_c_t'})).get_text()
pointobj=pointobj[2:len(pointobj)-1]
return title+' 当前阅读数:'+pointobj
  • Request模块

上网认证这篇文章有过大概的介绍。模块通过提供极其简单的方法名称接口,隐藏了复杂的网络工作,大大简化了代码。

  • Xpath

Xpath介绍Xpath不是一个模块,而是活跃在众多平台的一种工具,也可以称她是一种语言。通过对网页源代码解析,内部构建路径,轻松获取想要的内容。在BeautifulSoup太沉重复杂、又不想使用正则的情况下,是一种很棒的解决方案。

MyThread类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
class MyThread(QtCore.QThread):
sinOut = pyqtSignal(int,str,set)
articles = set()
global username
def __init__(self):
super(MyThread,self).__init__()
self.times=''

def setVal(self,username,times):
self.times=times

def run(self):
#发射信号
grab = Grab()
#获取各个目录页面信息
buf = grab.GetPage('http://blog.csdn.net/'+username)
pages = grab.GetPageUrl(buf)

content = set()
links = []
titles = []
for page in pages:
buf = grab.GetPage(page)
link,title = grab.ExtractInfo(buf)
links+=link
titles+=title
titles=zip(links,titles)
for link in links:
self.articles.add(link)
for title in titles:
tem = title[0]+' '+title[1]
content.add(tem)

self.sinOut.emit(1,'',content)
sumRes = len(self.articles)*int(self.times)
cur = 1
for i in range(0,int(self.times)):
for url in self.articles:
buf=grab.GetPage(url)
self.sinOut.emit(2,str(cur/sumRes*100),content)
self.sinOut.emit(3,grab.GetCurViewerPoint(buf),content)
cur+=1
time.sleep(0.1)

去掉time.sleep(seconds)推迟线程调用之后,会出现网络模块报错,这应该是所有网络模块都会碰到的情况。

软件下载

clinjie wechat
Think about u every day