-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMainThread.cpp
566 lines (525 loc) · 13.1 KB
/
MainThread.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
////////////////////////////////////////////////////
//设计者姓名:LWZ小组 刘克东 00348231
//项目名:大实习-搜索引擎-网络化爬虫
//创建日期:2004-12-10
//最近一次修改日期:2004-12-26
//
//全局变量:
// 用到NetCrawler.cpp中定义全局变量
//extern CNetCrawlerDlg *pDlg; 主窗口的指针
//extern bool ThreadPause; 是否暂停线程
//注:工作者线程(worker thread)的传入函数不能为类中的成员函数,
//故将传入函数声明为全局函数
// MainThread.cpp : implementation file
////////////////////////////////////////////////////
#include "stdafx.h"
#include "NetCrawler.h"
#include <afxmt.h>
#include "DownloadData.h"
#include "ProjectDlg.h"
#include "MainThread.h"
#include "NetCrawlerDlg.h"
#include <afxinet.h>
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
// Pointer to the main dialog window; defined in NetCrawler.cpp.
extern CNetCrawlerDlg *pDlg;
// Pause flag for the crawl; defined in NetCrawler.cpp.
extern bool ThreadPause;
/////////////////////////////////////////////////////////////////////////////
// MainThread
// Runtime-class support so the framework can create MainThread dynamically.
IMPLEMENT_DYNCREATE(MainThread, CWinThread)
// Constructor of the user-interface (controller) thread object.
// Starts with the "keep mainly Chinese text" option switched on and the
// stop flag (tested by the Run() loop) cleared.
MainThread::MainThread()
    : m_chinese(TRUE),
      m_bDone(FALSE)
{
}
// Destructor: performs no explicit clean-up.
MainThread::~MainThread() {}
// Thread initialisation.
// Shows the modal "new project" dialog. When the user confirms it, the
// dialog's settings are copied into the shared download data and into this
// object, the main dialog's "New" button is disabled, and the whole crawl is
// executed synchronously through Run(str_BeginURL).
// Returns TRUE in every case, including when the dialog is cancelled.
BOOL MainThread::InitInstance()
{
    // Ask the user for the project settings
    CProjectDlg NewProjectDlg;
    if(NewProjectDlg.DoModal()!=IDOK)
    {
        return TRUE;
    }
    // Settings of the shared data area
    m_DownData.SetPro(NewProjectDlg.m_FileId,NewProjectDlg.m_ThreadNum,NewProjectDlg.m_LocalDir);
    // Start address. The text is lower-cased as before; surrounding blanks
    // are removed so that the scheme test below looks at the real start.
    // NOTE(review): lower-casing also changes the path part of the URL,
    // which may matter on case-sensitive servers - confirm this is intended.
    NewProjectDlg.m_BeginURL.MakeLower();
    NewProjectDlg.m_BeginURL.TrimLeft();
    NewProjectDlg.m_BeginURL.TrimRight();
    // "http://" must be a prefix; finding it somewhere inside the text
    // (for example in a query string) does not make the address absolute.
    if(NewProjectDlg.m_BeginURL.Find(_T("http://"))!=0)
    {
        str_BeginURL=_T("http://")+NewProjectDlg.m_BeginURL;
    }
    else
    {
        str_BeginURL=NewProjectDlg.m_BeginURL;
    }
    // Project name
    str_ProjectName=NewProjectDlg.m_ProjectName;
    // URL filter: a substring every accepted link must contain (see FindURL).
    if(NewProjectDlg.m_Filter.IsEmpty())
    {
        // Default: three characters starting at index 11 of the start URL.
        // Presumably meant as the first letters after "http://www."
        // (11 characters) - TODO confirm for hosts without a "www." prefix.
        str_Confine=str_BeginURL.Mid(11,3);
    }
    else
    {
        str_Confine=NewProjectDlg.m_Filter;
    }
    // Language option
    m_chinese=NewProjectDlg.m_CH;
    // Tell the user which URL is used as the starting point
    AfxMessageBox(_T("选择该URL作为起点:")+str_BeginURL);
    // Disable the "New" button while the project runs. Both the dialog
    // pointer and the control lookup may yield NULL, so check before use.
    CWnd *pNewButton=(pDlg!=NULL) ? pDlg->GetDlgItem(IDC_BUTTON_NEW) : NULL;
    if(pNewButton!=NULL)
    {
        pNewButton->EnableWindow(FALSE);
    }
    // Run the project
    Run(str_BeginURL);
    return TRUE;
}
int MainThread::ExitInstance()
{
CWnd *button;
button=pDlg->GetDlgItem(IDC_BUTTON_NEW);
button->EnableWindow(TRUE);
// TODO: perform any per-thread cleanup here
return CWinThread::ExitInstance();
}
// Message map of MainThread. It currently contains no handler entries; the
// section between the AFX_MSG_MAP markers is maintained by ClassWizard.
BEGIN_MESSAGE_MAP(MainThread, CWinThread)
//{{AFX_MSG_MAP(MainThread)
// NOTE - the ClassWizard will add and remove mapping macros here.
//}}AFX_MSG_MAP
END_MESSAGE_MAP()
/////////////////////////////////////////////////////////////////////////////
//Global function
//Function: FindURL
//Purpose: global function called by the worker threads; extracts links from
//  a downloaded page and queues the ones that are new.
//Precondition:
//  the page has been downloaded and stored as a local temporary file whose
//  first line is the URL of the page itself (used here as the base URL).
//After return:
//  the caller deletes the temporary file.
//Parameters:
//  CString s        local path of the temporary file
//  MainThread *ptr  controller thread object; gives access to the shared
//                   data area and to the URL filter string
//Algorithm:
//  1. open the local file read-only
//  2. look for links; a link not yet present in the shared URL queue is added
//  3. close the file
//Notes:
//  - A worker-thread entry function cannot be a class member function, so it
//    and the functions it uses are declared as globals.
//  - Only the first lower-case "href" of each line is examined.
//  - Links are kept only if they contain ".htm" or end with '/', and only if
//    they contain ptr->str_Confine.
void FindURL(CString s, MainThread *ptr)
{
    CStdioFile fin;
    //Open read-only; nothing to do when the file does not exist
    if(!fin.Open(s,CFile::modeRead))
    {
        return;
    }
    //First line of the file: URL of the page, used to resolve relative links
    CString str_BaseURL;
    if(!fin.ReadString(str_BaseURL))
    {
        fin.Close();
        return;
    }
    //A link must be introduced by "href"
    const CString mark=_T("href");
    //Receives the absolute form of a relative link. Sized for the longest
    //URL WinINet handles, so ordinary long links are no longer dropped.
    TCHAR szCombined[INTERNET_MAX_URL_LENGTH];
    CString str_Line,str_URL;
    while(fin.ReadString(str_Line))
    {
        //Stop as soon as the URL queue is full
        if(ptr->m_DownData.IsFull())break;
        const int nMark=str_Line.Find(mark);
        //This line contains no link
        if(nMark==-1)continue;
        //Cut the line down to what follows "href", allowing forms such as
        //  href = "http://..."
        str_Line=str_Line.Mid(nMark+4);
        str_Line.TrimLeft();
        if(!str_Line.IsEmpty() && str_Line[0]=='=')
            str_Line=str_Line.Mid(1);
        str_Line.TrimLeft();
        //Nothing after "href": no link on this line
        if(str_Line.IsEmpty())continue;
        int URL_end=-1;
        const TCHAR chFirst=str_Line[0];
        if(chFirst=='\"' || chFirst=='\'')
        {
            //Quoted link: runs up to the matching quote; skip when the quote
            //is never closed or the link is empty
            URL_end=str_Line.Find(chFirst,1);
            if(URL_end==-1 || URL_end==1)continue;
            str_URL=str_Line.Mid(1,URL_end-1);
        }
        else
        {
            //Unquoted link: ends at the first '>' or blank, whichever is first
            const int nGt=str_Line.Find(_T('>'));
            const int nBlank=str_Line.Find(_T(' '));
            if(nGt==-1)URL_end=nBlank;
            else if(nBlank==-1)URL_end=nGt;
            else URL_end=(nGt<nBlank)?nGt:nBlank;
            if(URL_end==-1)continue;
            str_URL=str_Line.Left(URL_end);
        }
        //An empty link (e.g. "href=>") carries no address; skipping it also
        //keeps the last-character test below from indexing position -1
        if(str_URL.IsEmpty())continue;
        //Mail links are ignored
        if(str_URL.Find(_T("mailto:"))!=-1)continue;
        //Links containing '#' are ignored
        if(str_URL.Find(_T('#'))!=-1)continue;
        //Keep only page-like links: something containing ".htm" (this also
        //covers ".html" and ".shtml") or a directory ending in '/'
        if(str_URL.Find(_T(".htm"))==-1 && str_URL[str_URL.GetLength()-1]!='/')continue;
        //Convert a relative path into an absolute URL
        if(str_URL.Find(_T("http:"))==-1)
        {
            DWORD dwLength=INTERNET_MAX_URL_LENGTH;
            if(!InternetCombineUrl(str_BaseURL,str_URL,szCombined,&dwLength,ICU_BROWSER_MODE))
            {
                continue;
            }
            str_URL=szCombined;
        }
        //Apply the URL filter
        if(str_URL.Find(ptr->str_Confine)==-1)continue;
        //Queue the link unless it is already known
        if(!(ptr->m_DownData.IsExisted(str_URL)))
        {
            ptr->m_DownData.AddURL(str_URL);
        }
    }
    //Close the file
    fin.Close();
}
//函数名称:UINT DownloadFile(LPVOID pParam)
//函数功能描述:全局函数
// controlling function for the worker thread
// 从URL任务队列得到一个网址并尝试
//函数的输入参数:
// LPVOID pParam 主控线程的指针,用于获取共享数据区
//函数的抽象算法
// 1、试图从URL队列中获取一个URL,若失败则返回(结束线程)
// 2、根据地址向服务器发送请求,若请求失败则返回(结束线程)
// 3、根据网页,提取主要内容,并存一个临时文件,用FindURL函数查找链接
// 4、从共享数据区删除线程标签
// 5、结束线程
// 工作者线程(worker thread)的传入函数不能为类中的成员函数,
// 故将传入函数声明为全局函数
UINT DownloadFile(LPVOID pParam)
{
MainThread *ptr=(MainThread *)pParam;
CString URL;
//试图获取一个URL
if(!(ptr->m_DownData.GetCurURL(URL)))
{
ptr->m_DownData.DeleThread();
return 0;
}
//以下为建立网络发出请求
CInternetSession MyConnect(_T("Microsoft MFC APP"),1,INTERNET_OPEN_TYPE_DIRECT);
CHttpConnection* pServer = NULL;
CHttpFile* pHttpFile=NULL;
// check to see if this is a reasonable URL
CString strServerName;
CString strObject;
INTERNET_PORT nPort;
DWORD dwServiceType;
try
{
if (!AfxParseURL(URL, dwServiceType, strServerName, strObject, nPort) ||
dwServiceType != INTERNET_SERVICE_HTTP)
{
THROW(new CInternetException(dwServiceType));
}
pServer=MyConnect.GetHttpConnection(strServerName, nPort);
pHttpFile = pServer->OpenRequest(CHttpConnection::HTTP_VERB_GET,
strObject, NULL, 1, NULL, NULL);
pHttpFile->AddRequestHeaders(_T("Accept: text/*\r\nUser-Agent: MFC\r\n"));
pHttpFile->SendRequest();
DWORD StatusCode;
pHttpFile->QueryInfoStatusCode(StatusCode);
//file isn't there or is redirected
if (StatusCode == HTTP_STATUS_MOVED ||StatusCode == HTTP_STATUS_REDIRECT ||
StatusCode == HTTP_STATUS_REDIRECT_METHOD)
{
CString strNewLocation;
pHttpFile->QueryInfo(HTTP_QUERY_RAW_HEADERS_CRLF, strNewLocation);
int nPlace = strNewLocation.Find(_T("Location: "));
if (nPlace == -1)
{
THROW(new CInternetException(StatusCode));
}
strNewLocation = strNewLocation.Mid(nPlace + 10);
nPlace = strNewLocation.Find('\n');
if (nPlace > 0)
strNewLocation = strNewLocation.Left(nPlace);
// close up the redirected site
pHttpFile->Close();
delete pHttpFile;
pServer->Close();
delete pServer;
// figure out what the old place was
if (!AfxParseURL(strNewLocation, dwServiceType, strServerName, strObject, nPort))
{
THROW(new CInternetException(StatusCode));
}
if (dwServiceType != INTERNET_SERVICE_HTTP)
{
THROW(new CInternetException(StatusCode));
}
// try again at the new location
pServer = MyConnect.GetHttpConnection(strServerName, nPort);
pHttpFile = pServer->OpenRequest(CHttpConnection::HTTP_VERB_GET,
strObject, NULL, 1, NULL, NULL);
pHttpFile->AddRequestHeaders(_T("Accept: text/*\r\nUser-Agent: MFC\r\n"));
pHttpFile->SendRequest();
pHttpFile->QueryInfoStatusCode(StatusCode);
}
if (StatusCode != HTTP_STATUS_OK)
{
THROW(new CInternetException(StatusCode));
}
}catch(CInternetException *pEx)
{//出错处理
if(pServer!=NULL)
{
pServer->Close();
delete pServer;
}
if(pHttpFile!=NULL)
{
pHttpFile->Close();
delete pHttpFile;
}
pEx->Delete();
MyConnect.Close();
ptr->m_DownData.DeleThread();
pDlg->Add(URL+"\r\nDownload failed!\r\n",0);
return 0;
}
//message for "Connected"
//if m_DownData is not full save the file
bool b=!(ptr->m_DownData.IsFull());
CString str_FileName;
ptr->m_DownData.GetFileName(str_FileName);
//内容提取后保存为本地文件
CStdioFile LocalFile;
//本地临时文件
CStdioFile tempLocalFile;
LocalFile.Open(str_FileName,
CFile::modeCreate|CFile::modeWrite|CFile::typeText);
if(b)tempLocalFile.Open(str_FileName+".tmp",
CFile::modeCreate|CFile::modeWrite|CFile::typeText);
if(b)tempLocalFile.WriteString(URL+_T("\n"));
LocalFile.WriteString(URL+_T("\n"));
CString s;
UINT w=0,k=0;
int i=0,j=0;
LPTSTR p;
//找出本页的标题
while(pHttpFile->ReadString(s))
{//每次读取一行
if(b)tempLocalFile.WriteString(s+_T("\n"));
i=s.Find(_T("<TITLE>"));
if(i==-1) i=s.Find(_T("<Title>"));
if(i==-1) i=s.Find(_T("<title>"));
if(i!=-1)
{
i+=7;
j=s.Find(_T("</"),i);
if(j!=-1)LocalFile.WriteString(s.Mid(i,j-i)+_T("\n"));
else
{
LocalFile.WriteString(s.Mid(i));
while(pHttpFile->ReadString(s))
{
if(b)tempLocalFile.WriteString(s+_T("\n"));
j=s.Find(_T("</"));
if(j==-1)
{
LocalFile.WriteString(s);
continue;
}
LocalFile.WriteString(s.Left(j)+"\n");
break;
}
}
break;
}
if(s.Find(_T("</HAED>"))!=-1 || s.Find(_T("</Head>"))!=-1 || s.Find(_T("</head>"))!=-1)
{
LocalFile.WriteString(_T("Untitled Page\n"));
break;
}
};
bool body=false;
while(pHttpFile->ReadString(s))
{
if(b)tempLocalFile.WriteString(s+_T("\n"));
if(s.Find(_T("<BODY"))!=-1 || s.Find(_T("<body"))!=-1 || s.Find(_T("<Body"))!=-1)
body=true;
if(body && s.Find('>')!=-1)break;
}
//对主体内容的过滤
while(pHttpFile->ReadString(s))
{
if(b)tempLocalFile.WriteString(s+_T("\n"));
p=s.GetBuffer(1024);
ptr->TrimString(p,w,k,ptr->m_chinese);
s.ReleaseBuffer();
if(s!="")
{
int f=0,g=0;
//去除 符号
while((g=s.Find(_T(" "),f))!=-1)
{
s.Delete(g,6);
s.Insert(g, _T(" "));
f=g;
}
s.TrimLeft();
s.TrimRight();
if(s!="")
{
LocalFile.WriteString(s+_T(" "));
}
}
}
LocalFile.Close();
if(b)tempLocalFile.Close();
//查找临时文件中的链接
if(b)
{
FindURL(str_FileName+_T(".tmp"),ptr);
//删除临时文件
DeleteFile(str_FileName+_T(".tmp"));
}
pHttpFile->Close();
delete pHttpFile;
pServer->Close();
delete pServer;
MyConnect.Close();
//状态显示
pDlg->Add(URL+"\r\nDownload successfully!\r\n",1);
pDlg->m_ProgressBar.StepIt();
ptr->m_DownData.DeleThread();
return 1;
}
////////////////////////////////////////////////////////////////////////////
// MainThread message handlers
//Function: MainThread::Run(CString &str_Begin)
//Purpose: runs one crawl. Puts the root URL into the shared URL queue and
//  keeps starting DownloadFile worker threads until the stop flag m_bDone is
//  set or the queue is empty and no worker is registered any more.
//Parameters:
//  CString &str_Begin  root URL of the crawl
//Notes:
//  - This is the overload called explicitly from InitInstance().
//  - AddThread() is treated as permission to start one more worker. When the
//    thread cannot be created, the registration is undone with DeleThread()
//    (the call every DownloadFile exit makes); otherwise the worker count
//    would never return to zero and the loop below could not finish.
void MainThread::Run(CString &str_Begin)
{
    //Add the root URL to the URL queue of the shared data area
    m_DownData.AddURL(str_Begin);
    //Visit the root URL
    if(m_DownData.AddThread())
    {
        if(AfxBeginThread(DownloadFile,this)==NULL)
        {
            m_DownData.DeleThread();
        }
    }
    //Start worker threads one after another; each downloads one queued URL
    while(!m_bDone && !(m_DownData.IsEmpty() && m_DownData.GetCurThread()==0))
    {
        Sleep(100);
        //While the global ThreadPause is set no new worker is started
        if(ThreadPause)continue;
        if(m_DownData.AddThread())
        {
            if(AfxBeginThread(DownloadFile,this)==NULL)
            {
                m_DownData.DeleThread();
            }
        }
    }
    Sleep(1000);
    //Tell the user that the task is finished
    AfxMessageBox(_T("任务完成!"));
    //Re-enables the "New" button (see ExitInstance)
    ExitInstance();
}
//Function: TrimString
//Purpose: removes HTML markup from a string, in place.
//Parameters:
//  LPTSTR pszBuffer  string to process, terminated by '\0'; overwritten with
//                    the filtered text
//  UINT &w           number of '<' currently open; carried from call to call
//  UINT &k           number of '{' currently open; carried from call to call
//  bool chinese      whether mainly Chinese text is to be kept
//Returns:
//  always true.
//Algorithm:
//  Text between { and } is regarded as a function body and is removed
//  unconditionally. Text between < and > is regarded as a markup tag and is
//  removed. The first character kept after a tag is preceded by one blank so
//  that separate pieces of text stay separated.
//  When chinese is set and the string holds no character with a negative
//  value, the whole string is emptied.
//Notes:
//  - The "negative value" test detects Chinese text only where TCHAR is a
//    signed character type (presumably an MBCS build - TODO confirm).
//  - A '}' without a matching '{' leaves k at 0. Decrementing the unsigned
//    counter there would wrap it to a huge value and suppress all text of
//    every following call.
bool MainThread::TrimString(LPTSTR pszBuffer,UINT &w,UINT &k,bool chinese)
{
    LPTSTR pszSource=pszBuffer;
    LPTSTR pszDest=pszBuffer;
    //Set once a character with a negative value has been seen
    bool bHasChinese=false;
    //Set when a tag has just been opened from plain text
    bool bAfterTag=false;
    for(;*pszSource!='\0';pszSource++)
    {
        //Read before anything is written; pszDest never passes pszSource
        const TCHAR ch=*pszSource;
        if(ch<0)
        {
            bHasChinese=true;
        }
        if(ch=='{')k++;
        if(k==0)
        {//not enclosed in {}
            if(w!=0)
            {//inside <>: only track the nesting
                if(ch=='>')
                    w--;
                else if(ch=='<')
                    w++;
            }
            else if(ch=='<')
            {//a tag starts
                w++;
                bAfterTag=true;
            }
            else
            {//plain text: keep it
                if(bAfterTag)
                {//pieces of text are separated by a blank
                    *pszDest++=' ';
                    bAfterTag=false;
                }
                *pszDest++=ch;
            }
        }
        //Close a brace only if one is open
        if(ch=='}' && k!=0)k--;
    }
    //Terminate the result
    if(chinese && !bHasChinese)
    {
        //No Chinese character in this string: drop the string
        *pszBuffer='\0';
    }
    else
    {
        *pszDest='\0';
    }
    return true;
}