import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 作者:李兴球
* 从文本中提取链接,先读出文件内容,然后提取里面的链接
* 日期:2018/1/17
* 网址: http://www.scratch8.net/blog/
*/
public class RegexLink
{
public static void main(String[] args) throws IOException
{
String html = ReadTextFile("c:\\test.txt","utf-8");
ArrayList urlList = new ArrayList();
urlList = extractUrls(html);
String allLinks = String.join("\r\n", urlList);
WriteTextFile("C:\\links.txt",allLinks,"utf-8");
}
public static String ReadTextFile(String filePath,String encodeStyle) throws IOException
{
//读文本文件,指定文件编码UTF-8,GB2312
StringBuffer contents= new StringBuffer();
String line;
try (BufferedReader br = new BufferedReader(new FileReader(filePath)))
{
while ((line = br.readLine()) != null)
{
contents.append(line);
}
} catch (IOException e) { e.printStackTrace(); }
String result=new String(contents.toString().getBytes(),encodeStyle);
return result;
}
public static void WriteTextFile(String filePath,String fileContent,String encodeStyle) throws IOException
{
//写文本文件,指定文件编码UTF-8,GB2312,经测试一定要写入中文才会生成UTF-8的文件,否则都是ANSI文件!
//方法一
//StringBuffer buffer = new StringBuffer(fileContent);
//FileOutputStream writerStream = new FileOutputStream(filePath);
//BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(writerStream, encodeStyle));
//方法二
//OutputStreamWriter bf = new OutputStreamWriter(new FileOutputStream(filePath),encodeStyle);
//bf.write(fileContent);
//bf.close();
//方法三
PrintWriter out = new PrintWriter(new File(filePath), encodeStyle);
out.print(fileContent);
out.print("\n\n星空少儿编程测试代码\n");
out.flush();
out.close();
}
/**
* 返回文中的链接列表
*/
public static ArrayList extractUrls(String text)
{
ArrayList containedUrls = new ArrayList();
String urlRegex = "((https?|ftp|gopher|telnet|file):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)";
Pattern pattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE);
Matcher urlMatcher = pattern.matcher(text);
while (urlMatcher.find())
{
containedUrls.add(text.substring(urlMatcher.start(0),
urlMatcher.end(0)));
}
return containedUrls;
}
}