Java 使用正则表达式从文本中提取链接

发表于 2018 年 1 月 20 日，作者：李兴球

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

import java.nio.file.Files;
import java.nio.file.Paths;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

  /**
   *  Author: Li Xingqiu
   *  Extracts links from text: reads a file's contents, then pulls out the links it contains.
   *  Date: 2018/1/17
   *  Website: http://www.scratch8.net/blog/
   */
public class RegexLink
{

  /**
   * URL matcher, compiled once and reused (Pattern instances are immutable).
   *
   * A match is a scheme (http, https, ftp, gopher, telnet or file), a colon,
   * one or more "//" or "\" separators, and then any run of the characters
   * listed in the character class. The hyphen is escaped on purpose: written
   * as "+-=" it would form a range from '+' to '=' that also admits '<',
   * which makes a URL followed by an HTML tag swallow the start of that tag.
   */
  private static final Pattern URL_PATTERN = Pattern.compile(
      "(?:https?|ftp|gopher|telnet|file):(?://|\\\\)+[\\w:#@%/;$()~?+,\\-=\\\\.&]*",
      Pattern.CASE_INSENSITIVE);

  /**
   * Reads c:\test.txt as UTF-8, extracts every link found in it and writes
   * the links, one per line (CRLF separated), to C:\links.txt.
   *
   * @param args unused
   * @throws IOException if the input cannot be read or the output cannot be written
   */
  public static void main(String[] args) throws IOException
  {
    String html = ReadTextFile("c:\\test.txt", "utf-8");

    ArrayList<String> urlList = extractUrls(html);
    String allLinks = String.join("\r\n", urlList);

    WriteTextFile("C:\\links.txt", allLinks, "utf-8");
  }

  /**
   * Reads a whole text file and decodes it with the given charset.
   *
   * The bytes are decoded directly with {@code encodeStyle}, so the result
   * does not depend on the platform default charset, and line terminators
   * are preserved exactly as they appear in the file.
   *
   * @param filePath    path of the file to read
   * @param encodeStyle charset name used to decode the file, e.g. "utf-8" or "GB2312"
   * @return the decoded file contents (empty string for an empty file)
   * @throws IOException if the file cannot be read or the charset name is not supported
   */
  public static String ReadTextFile(String filePath, String encodeStyle) throws IOException
  {
    byte[] raw = Files.readAllBytes(Paths.get(filePath));
    return new String(raw, encodeStyle);
  }

  /**
   * Writes text to a file using the given charset, replacing any existing file.
   *
   * After {@code fileContent} a fixed footer line (two newlines, a short
   * Chinese test string, one newline) is appended; existing output relies on
   * it, so it is kept unchanged.
   *
   * @param filePath    path of the file to create or overwrite
   * @param fileContent text to write before the footer
   * @param encodeStyle charset name used to encode the file, e.g. "utf-8" or "GB2312"
   * @throws IOException if the file cannot be opened, the charset name is not
   *                     supported, or writing fails
   */
  public static void WriteTextFile(String filePath, String fileContent, String encodeStyle) throws IOException
  {
    try (PrintWriter out = new PrintWriter(new File(filePath), encodeStyle))
    {
      out.print(fileContent);
      out.print("\n\n星空少儿编程测试代码\n");

      // PrintWriter never throws on write; checkError() flushes and reports
      // whether any earlier write failed, so the failure is surfaced here.
      if (out.checkError())
      {
        throw new IOException("Failed to write file: " + filePath);
      }
    }
  }

  /**
   * Returns the links found in the text, in order of appearance.
   *
   * @param text text to scan; {@code null} is treated as containing no links
   * @return a new mutable list of the matched URLs, possibly empty, never null
   */
  public static ArrayList<String> extractUrls(String text)
  {
    ArrayList<String> containedUrls = new ArrayList<>();
    if (text == null)
    {
      return containedUrls;
    }

    Matcher urlMatcher = URL_PATTERN.matcher(text);
    while (urlMatcher.find())
    {
      containedUrls.add(urlMatcher.group());
    }

    return containedUrls;
  }

}

关于李兴球

李兴球的博客是Python创意编程原创博客

查看由李兴球发表的所有文章 →

此条目发表在Uncategorized分类目录，贴了java link extract, java少儿编程, java提取链接, java正则, java获取源代码标签。将固定链接加入收藏夹。

发表回复取消回复

要发表评论，您必须先登录。