| Displaying  Source Code(s)  
 
 
          
            |  |  |  
            | 
              
                 | A different method of converting a web page to plain text (html2text) 
 --------------------------------------------------------------------------------
 
 Description : This project extracts the plain text of any web page,
                 from Amazon to Google to Yahoo.
 
 Code :
 import java.io.File;
 import java.io.FileInputStream;
 
 import java.io.Reader;
 import java.io.StringReader;
 
 import java.io.IOException;
 
 /**
  * Convert text/html into text/plain.
  *
  * This is a small hand-written scanner, not an HTML parser: it walks the
  * input one character at a time, replaces a fixed set of tags with plain-text
  * layout (line breaks, bullets, quote marks), decodes a handful of character
  * entities and collapses runs of whitespace outside of &lt;pre&gt;.
  * Comments, scripts and style sheets get no special treatment.
  *
  * An instance keeps scanner state in its fields while {@link #convert}
  * runs, so one instance must not be used by several threads at once.
  *
  * Author: Omindra Kumar Rana
  * Email: rana_omindra@yahoo.co.in
  *
  * @version 1.0 $Date: May 10, 2005 $
  */
 public class HTML2Text
 {
     /** True once a &lt;body&gt; tag has been seen in the current document. */
     boolean body_found = false;
     /** True while the scanner is between &lt;body&gt; and &lt;/body&gt;. */
     boolean in_body = false;
     /** Set on &lt;center&gt; and cleared on &lt;/center&gt;; not read anywhere in this class. */
     boolean center = false;
     /** True while inside &lt;pre&gt;; whitespace is then copied through unchanged. */
     boolean pre = false;
     /** Target of the most recent &lt;a href&gt;, emitted when the matching &lt;/a&gt; arrives. */
     String href = "";

     /**
      * Converts an HTML document (or fragment) to plain text.
      *
      * Text found inside &lt;body&gt; is returned when the document has a body
      * tag; otherwise all text outside a body is returned. The result is
      * trimmed.
      *
      * @param source the HTML markup; must not be null
      * @return the plain-text rendering
      * @throws Exception if reading the input fails
      */
     public String convert(String source) throws Exception
     {
         // Start every document from a clean slate; otherwise flags left over
         // from a previous call (e.g. body_found) corrupt the next result.
         body_found = false;
         in_body = false;
         center = false;
         pre = false;
         href = "";

         StringBuilder bodyText = new StringBuilder();
         StringBuilder otherText = new StringBuilder();
         StringReader input = new StringReader(source);

         try
         {
             int c = input.read();
             while (c != -1) // Convert until EOF
             {
                 String text;
                 if (c == '<') // It's a tag: read the rest of it and translate
                 {
                     text = convertTag(getTag(input));
                 }
                 else if (c == '&')
                 {
                     text = decodeEntity(getSpecial(input));
                 }
                 else if (!pre && Character.isWhitespace((char) c))
                 {
                     // Outside <pre>, a run of whitespace collapses to one blank.
                     StringBuilder target = in_body ? bodyText : otherText;
                     boolean afterWhitespace = target.length() > 0
                         && Character.isWhitespace(target.charAt(target.length() - 1));
                     text = afterWhitespace ? "" : " ";
                 }
                 else
                 {
                     text = String.valueOf((char) c);
                 }

                 // The target is picked after tag conversion on purpose:
                 // <body> and </body> flip in_body.
                 (in_body ? bodyText : otherText).append(text);

                 c = input.read();
             }
         }
         finally
         {
             input.close();
         }

         return (body_found ? bodyText : otherText).toString().trim();
     }

     /**
      * Reads the remainder of a tag whose opening '&lt;' was already consumed.
      *
      * Nested '&lt;' characters raise a nesting level that each '&gt;' lowers;
      * reading stops when the level returns to zero or at end of input.
      *
      * @param r reader positioned just after the opening '&lt;'
      * @return the tag text including the leading '&lt;'
      * @throws IOException if the reader fails
      */
     String getTag(Reader r) throws IOException
     {
         StringBuilder tag = new StringBuilder();
         tag.append('<');

         int level = 1;
         while (level > 0)
         {
             int c = r.read();
             if (c == -1)
             {
                 break; // EOF inside a tag: return what we have
             }
             tag.append((char) c);
             if (c == '<')
             {
                 level++;
             }
             else if (c == '>')
             {
                 level--;
             }
         }

         return tag.toString();
     }

     /**
      * Reads a character reference whose '&amp;' was already consumed.
      *
      * Accepts a run of letters (named reference) or '#' followed by decimal
      * digits, or '#x' followed by hex digits (numeric reference). A
      * terminating ';' is consumed and included in the result; any other
      * terminating character is pushed back with reset().
      *
      * @param r reader supporting mark/reset, positioned just after '&amp;'
      * @return the reference text without the leading '&amp;', possibly empty
      * @throws IOException if the reader fails
      */
     String getSpecial(Reader r) throws IOException
     {
         StringBuilder result = new StringBuilder();
         r.mark(1); // Remember the position so the terminator can be un-read
         int c = r.read();

         if (c == '#')
         {
             result.append('#');
             r.mark(1);
             c = r.read();

             boolean hex = (c == 'x' || c == 'X');
             if (hex)
             {
                 result.append((char) c);
                 r.mark(1);
                 c = r.read();
             }
             while (isEntityDigit(c, hex))
             {
                 result.append((char) c);
                 r.mark(1);
                 c = r.read();
             }
         }
         else
         {
             while (Character.isLetter((char) c))
             {
                 result.append((char) c);
                 r.mark(1);
                 c = r.read();
             }
         }

         if (c == ';')
         {
             result.append(';');
         }
         else
         {
             r.reset();
         }

         return result.toString();
     }

     /** Tells whether c is an ASCII decimal digit, or an ASCII hex digit when hex is set. */
     private static boolean isEntityDigit(int c, boolean hex)
     {
         if (c >= '0' && c <= '9')
         {
             return true;
         }
         return hex && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'));
     }

     /**
      * Translates the text returned by getSpecial into its plain-text form.
      * Unrecognized references are returned literally, with the '&amp;' restored.
      */
     private static String decodeEntity(String special)
     {
         if (special.startsWith("#"))
         {
             return decodeNumericEntity(special);
         }
         if (special.equals("lt;"))
         {
             return "<";
         }
         if (special.equals("gt;"))
         {
             return ">";
         }
         if (special.equals("amp;"))
         {
             return "&";
         }
         if (special.equals("nbsp;"))
         {
             return " ";
         }
         if (special.equals("quot;"))
         {
             return "\"";
         }
         if (special.equals("copy;"))
         {
             return "[Copyright]";
         }
         if (special.equals("reg;"))
         {
             return "[Registered]";
         }
         if (special.equals("trade;"))
         {
             return "[Trademark]";
         }
         return "&" + special;
     }

     /**
      * Decodes a numeric reference such as "#60;", "#60" or "#x3C;".
      * Malformed or out-of-range references are returned literally.
      */
     private static String decodeNumericEntity(String special)
     {
         String literal = "&" + special;

         String digits = special.substring(1);
         if (digits.endsWith(";"))
         {
             digits = digits.substring(0, digits.length() - 1);
         }
         int radix = 10;
         if (digits.startsWith("x") || digits.startsWith("X"))
         {
             radix = 16;
             digits = digits.substring(1);
         }

         int code;
         try
         {
             code = Integer.parseInt(digits, radix);
         }
         catch (NumberFormatException e)
         {
             return literal; // no digits at all, or too large for an int
         }

         switch (code)
         {
             case 160:
                 return " "; // same rendering as &nbsp;
             case 169:
                 return "[Copyright]";
             case 174:
                 return "[Registered]";
             case 153:  // value the original code listed for the trademark sign
             case 8482: // code point of &trade;
                 return "[Trademark]";
             default:
                 break;
         }
         if (code == 0 || !Character.isValidCodePoint(code))
         {
             return literal;
         }
         return new String(Character.toChars(code));
     }

     /**
      * Tells whether tag text s1 is the tag named s2, ignoring case.
      *
      * The name must be followed by '&gt;', '/' (as in &lt;br/&gt;) or
      * whitespace, so "b" does not match "&lt;br&gt;" or "&lt;body&gt;".
      *
      * @param s1 tag text starting with '&lt;'
      * @param s2 tag name, with a leading '/' for an end tag
      * @return true if s1 is a tag named s2
      */
     boolean isTag(String s1, String s2)
     {
         String prefix = "<" + s2;
         // regionMatches(ignoreCase) folds case per character and does not
         // depend on the default locale, unlike toLowerCase().
         if (!s1.regionMatches(true, 0, prefix, 0, prefix.length()))
         {
             return false;
         }
         if (s1.length() == prefix.length())
         {
             return false; // truncated tag: nothing terminates the name
         }
         char next = s1.charAt(prefix.length());
         return next == '>' || next == '/' || Character.isWhitespace(next);
     }

     /**
      * Returns the plain-text replacement for one tag and updates the
      * scanner state (in_body, body_found, center, pre, href) accordingly.
      *
      * NOTE(review): the escape sequences in the replacement strings were
      * reconstructed from a listing whose string literals had been damaged
      * by HTML extraction; confirm against the original source if available.
      *
      * @param t complete tag text as returned by getTag
      * @return the text to emit in place of the tag, possibly empty
      * @throws IOException declared for callers; not thrown by this body
      */
     String convertTag(String t) throws IOException
     {
         String result = "";

         if (isTag(t, "body"))
         {
             in_body = true;
             body_found = true;
         }
         else if (isTag(t, "/body"))
         {
             in_body = false;
             result = "\n";
         }
         else if (isTag(t, "center"))
         {
             result = "\n";
             center = true;
         }
         else if (isTag(t, "/center"))
         {
             result = "\n";
             center = false;
         }
         else if (isTag(t, "pre"))
         {
             result = "\n";
             pre = true;
         }
         else if (isTag(t, "/pre"))
         {
             result = "\n";
             pre = false;
         }
         else if (isTag(t, "p"))
         {
             result = "\n\n";
         }
         else if (isTag(t, "br"))
         {
             result = "\n";
         }
         else if (isTag(t, "h1") || isTag(t, "h2") || isTag(t, "h3") || isTag(t, "h4")
             || isTag(t, "h5") || isTag(t, "h6") || isTag(t, "h7"))
         {
             result = "\n";
         }
         else if (isTag(t, "/h1") || isTag(t, "/h2") || isTag(t, "/h3") || isTag(t, "/h4")
             || isTag(t, "/h5") || isTag(t, "/h6") || isTag(t, "/h7"))
         {
             result = "\n";
         }
         else if (isTag(t, "/dl"))
         {
             result = "\n";
         }
         else if (isTag(t, "dd"))
         {
             result = "\n* ";
         }
         else if (isTag(t, "dt"))
         {
             result = " ";
         }
         else if (isTag(t, "li"))
         {
             result = "\n* ";
         }
         else if (isTag(t, "/ul"))
         {
             result = "\n";
         }
         else if (isTag(t, "/ol"))
         {
             result = "\n";
         }
         else if (isTag(t, "hr"))
         {
             result = "_________________________________________\n\n";
         }
         else if (isTag(t, "table"))
         {
             result = "\n";
         }
         else if (isTag(t, "/table"))
         {
             result = "\n";
         }
         else if (isTag(t, "form"))
         {
             result = "\n";
         }
         else if (isTag(t, "/form"))
         {
             result = "\n";
         }
         else if (isTag(t, "b"))
         {
             result = "*";
         }
         else if (isTag(t, "/b"))
         {
             result = "*";
         }
         else if (isTag(t, "i"))
         {
             result = "\"";
         }
         else if (isTag(t, "/i"))
         {
             result = "\"";
         }
         else if (isTag(t, "img"))
         {
             // An image is represented by its alternative text, if any.
             String alt = quotedAttribute(t, "alt");
             if (alt != null)
             {
                 result = alt;
             }
         }
         else if (isTag(t, "a"))
         {
             // Remember the target; it is printed when </a> is reached.
             String target = quotedAttribute(t, "href");
             href = (target != null) ? target : "";
         }
         else if (isTag(t, "/a"))
         {
             if (href.length() > 0)
             {
                 result = " [ " + href + " ]";
                 href = "";
             }
         }

         return result;
     }

     /**
      * Extracts the value of a quoted attribute (name="value" or
      * name='value') from tag text, matching the name case-insensitively.
      *
      * @return the value, or null when the attribute is absent, unquoted,
      *         or its closing quote is missing
      */
     private static String quotedAttribute(String tag, String name)
     {
         String key = name + "=";
         int from = 0;
         while (true)
         {
             int at = indexOfIgnoreCase(tag, key, from);
             if (at == -1)
             {
                 return null;
             }
             int quoteAt = at + key.length();
             if (quoteAt < tag.length())
             {
                 char quote = tag.charAt(quoteAt);
                 if (quote == '"' || quote == '\'')
                 {
                     int end = tag.indexOf(quote, quoteAt + 1);
                     // A missing closing quote used to make substring() throw.
                     return (end == -1) ? null : tag.substring(quoteAt + 1, end);
                 }
             }
             from = at + 1; // unquoted occurrence: keep looking
         }
     }

     /** Case-insensitive indexOf; returns -1 when needle does not occur at or after from. */
     private static int indexOfIgnoreCase(String haystack, String needle, int from)
     {
         int last = haystack.length() - needle.length();
         for (int i = from; i <= last; i++)
         {
             if (haystack.regionMatches(true, i, needle, 0, needle.length()))
             {
                 return i;
             }
         }
         return -1;
     }

     /**
      * Converts the HTML file named by the first argument (default:
      * html_test_file.html) and prints the text to standard output.
      * The file is decoded with the platform default charset.
      *
      * @param argv optional single element: path of the HTML file
      * @throws Exception if the file cannot be read or converted
      */
     public static void main(String argv[]) throws Exception
     {
         // Check the length first: indexing an empty argv throws.
         boolean hasPath = argv.length > 0 && argv[0] != null;
         File file = new File(hasPath ? argv[0] : "html_test_file.html");

         // Size the buffer from the file length and read until it is full;
         // available() only reports bytes readable without blocking and a
         // single read() call may return fewer bytes than requested.
         byte buf[] = new byte[(int) file.length()];
         int filled = 0;

         FileInputStream fis = new FileInputStream(file);
         try
         {
             while (filled < buf.length)
             {
                 int n = fis.read(buf, filled, buf.length - filled);
                 if (n == -1)
                 {
                     break; // file shrank while reading
                 }
                 filled += n;
             }
         }
         finally
         {
             fis.close();
         }

         String s = new String(buf, 0, filled);
         HTML2Text h = new HTML2Text();
         System.out.println(h.convert(s));
     }
 }
 |  |  |