Java program to read text from an HTML file


  • import java.io.FileReader;
  • import java.io.IOException;
  • import java.io.InputStreamReader;
  • import java.io.Reader;
  • import java.net.URL;
  • import java.net.URLConnection;

  • import java.io.*;
  • import java.net.*;
  • import javax.swing.text.*;
  • import javax.swing.text.html.*;

  • public class ReadTextFromHTML {

  • public static void main(String[] args) throws Exception{
  •  EditorKit kit = new HTMLEditorKit();
  •         Document doc = kit.createDefaultDocument();

  •         // The Document class does not yet handle charset's properly.
  •         doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

  •         // Create a reader on the HTML content.

  •         Reader rd = getReader("http://theprogrammersfirst.blogspot.com/2016/06/javalangexception-error-getting-all.html");

  •         // Parse the HTML.

  •         kit.read(rd, doc, 0);

  •         //  The HTML text is now stored in the document
  •         String s=(doc.getText(0, doc.getLength())).trim().replaceAll(" \n+", " ");
  •         System.out.println(s);

  • }
  • static Reader getReader(String uri)
  •         throws IOException
  •     {

  •         // Retrieve from Internet.
  •         if (uri.startsWith("http:"))
  •         {System.out.println(uri);
  •             URLConnection conn = new URL(uri).openConnection();
  •             return new InputStreamReader(conn.getInputStream());
  •         }
  •         // Retrieve from file.
  •         else
  •         {
  •             return new FileReader(uri);
  •         }
  •     }
  • }


Comments

Popular posts from this blog

Today Walkin 14th-Sept

Spring Elasticsearch Operations

Hibernate Search - Elasticsearch with JSON manipulation