import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.File;

/**
 * Extracts links from a text file: reads the file content, finds every URL in it,
 * and writes the URLs (one per line) to an output file.
 *
 * <p>Author: Li Xingqiu (李兴球)<br>
 * Date: 2018/1/17<br>
 * Site: http://www.scratch8.net/blog/
 */
public class RegexLink {

    /**
     * Matches URLs with one of the schemes http, https, ftp, gopher, telnet or file,
     * followed by one or more "//" or "\" separators and a run of URL characters.
     *
     * <p>The hyphen in the character class is escaped on purpose. Written unescaped
     * between '+' and '=' it forms a range that also admits '&lt;', which makes matches
     * run into a following HTML tag (e.g. {@code http://a.com</a}). ',' and '-' are
     * listed explicitly so that every other character of that former range is still
     * accepted.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "((https?|ftp|gopher|telnet|file):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+,\\-=\\\\\\.&]*)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Reads the input file, extracts its URLs and writes them to the output file,
     * separated by CRLF.
     *
     * @param args optional: {@code args[0]} is the input path (default {@code c:\test.txt}),
     *             {@code args[1]} is the output path (default {@code C:\links.txt})
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = args.length > 0 ? args[0] : "c:\\test.txt";
        String outputPath = args.length > 1 ? args[1] : "C:\\links.txt";

        String html = ReadTextFile(inputPath, "utf-8");
        ArrayList<String> urlList = extractUrls(html);
        String allLinks = String.join("\r\n", urlList);
        WriteTextFile(outputPath, allLinks, "utf-8");
    }

    /**
     * Reads a whole text file into a string, decoding it with the given charset.
     *
     * <p>The content is returned verbatim, line terminators included, so that a URL at
     * the end of one line is not fused with the first word of the next line.
     *
     * @param filePath    path of the file to read
     * @param encodeStyle charset name used to decode the file, e.g. "UTF-8" or "GB2312"
     * @return the decoded file content
     * @throws IOException if the file cannot be opened or read, or the charset name is
     *                     not supported
     */
    public static String ReadTextFile(String filePath, String encodeStyle) throws IOException {
        StringBuilder contents = new StringBuilder();
        // The stream is a separate resource so it is closed even when the reader
        // constructor rejects the charset name.
        try (FileInputStream in = new FileInputStream(filePath);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, encodeStyle))) {
            char[] buffer = new char[8192];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                contents.append(buffer, 0, count);
            }
        }
        return contents.toString();
    }

    /**
     * Writes a string to a text file using the given charset, followed by a fixed
     * test footer.
     *
     * <p>The footer contains Chinese text. Per the original author's note, a file with
     * only ASCII content is reported as "ANSI" by editors (ASCII bytes are identical in
     * both encodings), so the footer makes the chosen encoding visible.
     *
     * @param filePath    path of the file to create or overwrite
     * @param fileContent text to write
     * @param encodeStyle charset name used to encode the file, e.g. "UTF-8" or "GB2312"
     * @throws IOException if the file cannot be created or written, or the charset name
     *                     is not supported
     */
    public static void WriteTextFile(String filePath, String fileContent, String encodeStyle)
            throws IOException {
        try (PrintWriter out = new PrintWriter(new File(filePath), encodeStyle)) {
            out.print(fileContent);
            out.print("\n\n星空少儿编程测试代码\n");
            // PrintWriter never throws on write failure; checkError() flushes and
            // reports whether any write failed.
            if (out.checkError()) {
                throw new IOException("Failed to write file: " + filePath);
            }
        }
    }

    /**
     * Returns the list of links found in the text, in order of appearance.
     *
     * @param text text to scan; may be {@code null}
     * @return every URL matched in {@code text}; empty if there is none or {@code text}
     *         is {@code null}
     */
    public static ArrayList<String> extractUrls(String text) {
        ArrayList<String> containedUrls = new ArrayList<>();
        if (text == null) {
            return containedUrls;
        }
        Matcher urlMatcher = URL_PATTERN.matcher(text);
        while (urlMatcher.find()) {
            containedUrls.add(urlMatcher.group());
        }
        return containedUrls;
    }
}
李兴球
李兴球的博客是Python创意编程原创博客
要发表评论,您必须先登录。
发表评论