`

单线程蜘蛛查找死链接

阅读更多

截图效果:

 

源程序共有四个文件,分别如下:
CheckLinks.java

package org.qyclass;
import java.awt.*;
import javax.swing.*;
import java.net.*;
import java.io.*;
public class CheckLinks extends javax.swing.JFrame implements Runnable, ISpiderReportable {
public CheckLinks() {
//{{INIT_CONTROLS
setTitle("找到死链接");
getContentPane().setLayout(null);
setSize(405, 288);
setVisible(false);
label1.setText("输入一个URL:");
getContentPane().add(label1);
label1.setBounds(12, 12, 84, 12);
begin.setText("Begin");
begin.setActionCommand("Begin");
getContentPane().add(begin);
begin.setBounds(12, 36, 84, 24);
getContentPane().add(url);
url.setBounds(108, 36, 288, 24);
errorScroll.setAutoscrolls(true);
errorScroll.setHorizontalScrollBarPolicy(javax.swing.ScrollPaneC*****tants.HORIZONTAL_SCROLLBAR_ALWAYS);
errorScroll.setVerticalScrollBarPolicy(javax.swing.ScrollPaneC*****tants.VERTICAL_SCROLLBAR_ALWAYS);
errorScroll.setOpaque(true);
getContentPane().add(errorScroll);
errorScroll.setBounds(12, 120, 384, 156);
errors.setEditable(false);
errorScroll.getViewport().add(errors);
errors.setBounds(0, 0, 366, 138);
current.setText("当前处理进度:");
getContentPane().add(current);
current.setBounds(12, 72, 384, 12);
goodLinksLabel.setText("正常链接:0");
getContentPane().add(goodLinksLabel);
goodLinksLabel.setBounds(12, 96, 192, 12);
badLinksLabel.setText("死链接:0");
getContentPane().add(badLinksLabel);
badLinksLabel.setBounds(216, 96, 96, 12);
//}}
//{{INIT_MENUS
//}}
//{{REGISTER_LISTENERS
SymAction lSymAction = new SymAction();
begin.addActionListener(lSymAction);
//}}
}
/**
*参数args未使用
*/
static public void main(String args[]) {
(new CheckLinks()).setVisible(true);
}
/**
*添加通知
*/
@Override
public void addNotify() {
//记录窗口尺寸并调用父类的addNotify.
Dimension size = getSize();
super.addNotify();
if (frameSizeAdjusted) {
return;
}
frameSizeAdjusted = true;
//根据菜单栏等调整Frame尺寸
Insets insets = getInsets();
javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
int menuBarHeight = 0;
if (menuBar != null) {
menuBarHeight = menuBar.getPreferredSize().height;
}
setSize(insets.left + insets.right + size.width, insets.top +
insets.bottom + size.height +
menuBarHeight);
}
boolean frameSizeAdjusted = false;
//{{
javax.swing.JLabel label1 = new javax.swing.JLabel();
javax.swing.JButton begin = new javax.swing.JButton();
javax.swing.JTextField url = new javax.swing.JTextField();
javax.swing.JScrollPane errorScroll = new javax.swing.JScrollPane();
/**
*存储错误信息
*/
javax.swing.JTextArea errors = new javax.swing.JTextArea();
javax.swing.JLabel current = new javax.swing.JLabel();
javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
//}}
//{{
//
/**
*后台蜘蛛线程
*/
protected Thread backgroundThread;
protected Spider spider;
protected URL base;
protected int badLinksCount = 0;
protected int goodLinksCount = 0;
/**
*用于分发事件的内部类
*/
class SymAction implements java.awt.event.ActionListener {
public void actionPerformed(java.awt.event.ActionEvent event) {
Object object = event.getSource();
if (object == begin) {
begin_actionPerformed(event);
}
}
}
/**
*当begin或cancel按钮被点击时调用
*
*参数event与按钮相连
*/
void begin_actionPerformed(java.awt.event.ActionEvent event) {
if (backgroundThread == null) {
begin.setLabel("Cancel");
backgroundThread = new Thread(this);
backgroundThread.start();
goodLinksCount = 0;
badLinksCount = 0;
} else {
spider.cancel();
}
}
/**
*执行后台线程操作
*/
public void run() {
try {
errors.setText("");
spider = new Spider(this);
spider.clear();
base = new URL(url.getText());
spider.addURL(base);
spider.begin();
Runnable doLater = new Runnable() {
public void run() {
begin.setText("Begin");
}
};
SwingUtilities.invokeLater(doLater);
backgroundThread = null;
} catch (MalformedURLException e) {
UpdateErrors err = new UpdateErrors();
err.msg = "错误地址。";
SwingUtilities.invokeLater(err);
}
}
/**
*当找到某一URL时由蜘蛛调用,在此验证链接。
*
*参数base是找到链接时的页面
*参数url是链接地址
*/
public boolean spiderFoundURL(URL base, URL url) {
UpdateCurrentStats cs = new UpdateCurrentStats();
cs.msg = url.toString();
SwingUtilities.invokeLater(cs);
if (!checkLink(url)) {
UpdateErrors err = new UpdateErrors();
err.msg = url + "(on page " + base + ")\n";
SwingUtilities.invokeLater(err);
badLinksCount++;
return false;
}
UpdateErrors err = new UpdateErrors();
err.msg = url.toString()+"\n";
SwingUtilities.invokeLater(err);
goodLinksCount++;
if (!url.getHost().equalsIgnoreCase(base.getHost())) {
return false;
} else {
return true;
}
}
/**
*当发现URL错误时调用
*
*参数url是导致错误的URL
*/
public void spiderURLError(URL url) {
}
/**
*由内部调用检查链接是否有效
*
*参数url是被检查的链接
*返回True表示链接正常有效
*/
protected boolean checkLink(URL url) {
try {
URLConnection connection = url.openConnection();
connection.connect();
return true;
} catch (IOException e) {
return false;
}
}
/**
*当蜘蛛找到电子邮件地址时调用
*
*参数email为找到的电子邮件地址
*/
public void spiderFoundEMail(String email) {
}
/**
*以线程安全方式更新错误信息的内部类
*/
class UpdateErrors implements Runnable {
public String msg;
public void run() {
errors.append(msg);
}
}
/**
*以线程安全方式更新当前状态信息
*/
class UpdateCurrentStats implements Runnable {
public String msg;
public void run() {
current.setText("当前进度:" + msg);
goodLinksLabel.setText("正常链接:" + goodLinksCount);
badLinksLabel.setText("死链接:" + badLinksCount);
}
}
}


HTMLParse.java


/*

* To change this template, choose Tools | Templates

* and open the template in the editor.

*/

package org.qyclass;

import javax.swing.text.html.*;

/**
 * Minimal {@link HTMLEditorKit} subclass that exposes the kit's HTML parser
 * through a public {@code getParser()} so other classes can obtain it.
 */
public class HTMLParse extends HTMLEditorKit {

    /**
     * Returns the parser provided by the superclass, unchanged.
     *
     * @return the parser obtained from {@code super.getParser()}
     */
    @Override
    public HTMLEditorKit.Parser getParser() {
        final HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
复制代码
ISpiderReportable.java

/*

* To change this template, choose Tools | Templates

* and open the template in the editor.

*/

package org.qyclass;

import java.net.*;

/**
 * Callback contract through which a spider reports what it encounters.
 */
interface ISpiderReportable {

    /**
     * Reports a URL discovered by the spider.
     *
     * @param base the page on which the link was found
     * @param url  the discovered link
     * @return a flag the spider uses to decide whether to queue {@code url}
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Reports a URL that caused an error.
     *
     * @param url the offending URL
     */
    void spiderURLError(URL url);

    /**
     * Reports an e-mail address discovered by the spider.
     *
     * @param email the address found
     */
    void spiderFoundEMail(String email);
}
复制代码
Spider.java

/*

* To change this template, choose Tools | Templates

* and open the template in the editor.

*/

package org.qyclass;

import java.util.*;

import java.net.*;

import java.io.*;

import javax.swing.text.*;

import javax.swing.text.html.*;

 

public class Spider {

/**

   *导致错误的URL集合

&

nbsp;  */

protected Collection workloadError = new ArrayList(3);

/**

   *等待区URL集合

   */

protected Collection workloadWaiting = new ArrayList(3);

/**

   *处理过的URL集合

   */

protected Collection workloadProcessed = new ArrayList(3);

protected ISpiderReportable report;

/**

   *表明处理过程是否应取消的标志

   */

protected boolean cancel = false;

/**

   *构造函数

   *

   *参数report为实现了ISpiderReportable接口的类

   */

public Spider(ISpiderReportable report)

{

    this.report = report;

}

/**

   *获取导致错误的URL

   */

public Collection getWorkloadError()

{

    return workloadError;

}

/**

   *获取在等待的URL

   *应添加至少一个URL到此集合以启动蜘蛛

   */

public Collection getWorkloadWaiting()

{

    return workloadWaiting;

}

/**

   *获取被处理过的URL

   */

public Collection getWorkloadProcessed()

{

    return workloadProcessed;

}

/**

   *清空所有

   */

public void clear()

{

    getWorkloadError().clear();

    getWorkloadWaiting().clear();

    getWorkloadProcessed().clear();

}

/**

   *设置一标志,使begin方法在完成之前返回

   */

public void cancel()

{

    cancel = true;

}

 

public void addURL(URL url)

{

    if ( getWorkloadWaiting().contains(url) )

      return;

    if ( getWorkloadError().contains(url) )

      return;

    if ( getWorkloadProcessed().contains(url) )

      return;

    log("正添加到工作区:" + url );

    getWorkloadWaiting().add(url);

}

 

public void processURL(URL url)

{

    try {

      log("正在处理:" + url );

      //获取URL的内容

      URLConnection connection = url.openConnection();

      if ( (connection.getContentType()!=null) &&

           !connection.getContentType().toLowerCase().startsWith("text/") ) {

        getWorkloadWaiting().remove(url);

        getWorkloadProcessed().add(url);

        log("不会进行正理,因为类型为:" +

             connection.getContentType() );

        return;

      }

      //读取URL

      InputStream is = connection.getInputStream();

      Reader r = new InputStreamReader(is);

      //解析URL

      HTMLEditorKit.Parser parse = new HTMLParse().getParser();

      parse.parse(r,new Parser(url),true);

    } catch ( IOException e ) {

      getWorkloadWaiting().remove(url);

      getWorkloadError().add(url);

      log("错误:" + url );

      report.spiderURLError(url);

      return;

    }

    //标记此URL已完成

    getWorkloadWaiting().remove(url);

    getWorkloadProcessed().add(url);

    log("已完成:" + url );

  }

 

public void begin()

{

    cancel = false;

    while ( !getWorkloadWaiting().isEmpty() && !cancel ) {

      Object list[] = getWorkloadWaiting().toArray();

      for ( int i=0;(i<list.length)&&!cancel;i++ )

        processURL((URL)list[i]);

    }

}

/**

*HTML解析器回调函数

*/

protected class Parser

extends HTMLEditorKit.ParserCallback {

    protected URL base;

    public Parser(URL base)

    {

      this.base = base;

    }

    public void handleSimpleTag(HTML.Tag t,

                                MutableAttributeSet a,int pos)

    {

      String href = (String)a.getAttribute(HTML.Attribute.HREF);

      if( (href==null) && (t==HTML.Tag.FRAME) )

        href = (String)a.getAttribute(HTML.Attribute.SRC);

      if ( href==null )

          return;

      int i = href.indexOf("#");

      if ( i!=-1 )

        href = href.substring(0,i);

      if ( href.toLowerCase().startsWith("mailto:") ) {

        report.spiderFoundEMail(href);

      return;

      }

      handleLink(base,href);

    }

    public void handleStartTag(HTML.Tag t,

                               MutableAttributeSet a,int pos)

    {

      handleSimpleTag(t,a,pos);//以同样的方式处理

    }

    protected void handleLink(URL base,String str)

    {

      try {

        URL url = new URL(base,str);

        if ( report.spiderFoundURL(base,url) )

          addURL(url);

      } catch ( MalformedURLException e ) {

        log("找到畸形URL:" + str );

      }

    }

}

/**

   *由内部调用来记录信息

   *仅是把日志写到标准输出

   *

   *参数entry为写到日志的信息

   */

public void log(String entry)

{

    System.out.println( (new Date()) + ":" + entry );

}

}
原帖地址:http://www.phpjava.org/thread-81-1-1.html

本文来自: PJDN--php&Java论坛|技术交流社区,打造中国php&java开发者社区[www.phpjava.org]

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics