A few years ago, while working on a web-scraping tool in .NET, I found an amazing library, SgmlReader, which made it very easy to convert HTML documents to XHTML. With it I was able to run XPath queries to extract whatever information I wanted from any website, even one written in the worst possible malformed HTML. Had it not been for SgmlReader, I would have had to write tedious parsing code to extract the tokens from the HTML string.

With this simple code you can clean up the mess that most webmasters make!

Here is the function for VB.NET. Please download SgmlReader from the link above.

    ' Converts a (possibly malformed) HTML string into well-formed XML by
    ' streaming it through SgmlReader into an XmlTextWriter.
    '
    ' txtHTMLString: the raw HTML markup to convert.
    ' Returns: the XML text produced by the writer.
    '
    ' The reader, the writer and the backing StringWriter are wrapped in Using
    ' blocks so they are released even if WriteNode throws part-way through.
    Public Function Html2Xml(ByVal txtHTMLString As String) As String
        Using sw As New StringWriter()
            Using reader As New Sgml.SgmlReader()
                reader.DocType = "HTML"
                reader.InputStream = New StringReader(txtHTMLString)

                Using w As New XmlTextWriter(sw)
                    ' Copy nodes from the reader until it reports end of input.
                    While Not reader.EOF
                        w.WriteNode(reader, True)
                    End While
                End Using ' closing the writer flushes everything into sw
            End Using

            Return sw.ToString()
        End Using
    End Function

 

Recently I encountered a similar need in PHP. I searched desperately for an equivalent of SgmlReader, and my search zeroed in on the php_tidy extension. Once you enable this extension you get all of the same functionality.

	// Fetch the page and let tidy turn it into well-formed XML.
	$opts = array("clean" => true, "output-xml" => true);
	$xhtml = tidy_parse_file("http://www.example.com", $opts);

	// tidy_parse_file() returns false when the document cannot be loaded or parsed.
	if ($xhtml !== false) {
		// Parsing alone does not run tidy's clean-and-repair pass; without this call
		// the "clean" option requested above is never applied to the document.
		$xhtml->cleanRepair();
		echo $xhtml;
	}

 

For more information about php_tidy, go to http://us.php.net/tidy

 

CURL Class with cookie support

On September 15, 2009, in PHP, by Vaibhav

I found a really nice PHP class for CURL usage.
Source: http://us2.php.net/manual/en/book.curl.php#90821

I have modified the code for my needs.

Example Usage:

// Send a POST request to the URL given to the constructor and print the response body.
$request = new mycurl("http://www.example.com");
$request->setPost($example_post_data);
// Called without an argument, createCurl() falls back to its default 'nul'
// sentinel, which keeps the URL that was passed to the constructor.
$request->createCurl();
$body = $request->content();
echo $body; 

 

Here is the code for the curl class:

/**
 * Small wrapper around the PHP cURL extension with cookie-jar support.
 *
 * Typical use: construct with a URL, optionally call setPost()/useAuth()/etc.,
 * call createCurl() to perform the request, then read content() and
 * getHttpStatus().
 */
class mycurl {
	protected $_useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2";
	protected $_url;
	protected $_followlocation;
	protected $_timeout;
	protected $_maxRedirects;
	protected $_cookieFileLocation = './cookie.txt';
	protected $_post = false;
	protected $_postFields;
	protected $_referer = "http://www.google.com";

	protected $_session;
	protected $_webpage;
	protected $_includeHeader;
	protected $_noBody;
	protected $_status;
	protected $_binaryTransfer;
	// Request headers of the last transfer, as reported by CURLINFO_HEADER_OUT.
	protected $_requestHeaders;
	// cURL error text of the last transfer ('' when there was none).
	protected $_error = '';
	public    $authentication = 0;
	public    $auth_name      = '';
	public    $auth_pass      = '';

	/**
	 * Enable or disable sending credentials (CURLOPT_USERPWD) with the request.
	 *
	 * @param mixed $use Loosely compared against true.
	 */
	public function useAuth($use)
	{
		$this->authentication = ($use == true) ? 1 : 0;
	}

	/** @param string $name User name used when authentication is enabled. */
	public function setName($name)
	{
		$this->auth_name = $name;
	}

	/** @param string $pass Password used when authentication is enabled. */
	public function setPass($pass)
	{
		$this->auth_pass = $pass;
	}

	/**
	 * @param string $url            URL to request.
	 * @param bool   $followlocation Value for CURLOPT_FOLLOWLOCATION.
	 * @param int    $timeOut        Value for CURLOPT_TIMEOUT.
	 * @param int    $maxRedirecs    Value for CURLOPT_MAXREDIRS.
	 * @param bool   $binaryTransfer Stored but currently not applied to the handle.
	 * @param bool   $includeHeader  When true, CURLOPT_HEADER is enabled.
	 * @param bool   $noBody         When true, CURLOPT_NOBODY is enabled.
	 */
	public function __construct($url, $followlocation = true, $timeOut = 30, $maxRedirecs = 4, $binaryTransfer = false, $includeHeader = false, $noBody = false)
	{
		$this->_url = $url;
		$this->_followlocation = $followlocation;
		$this->_timeout = $timeOut;
		$this->_maxRedirects = $maxRedirecs;
		$this->_noBody = $noBody;
		$this->_includeHeader = $includeHeader;
		$this->_binaryTransfer = $binaryTransfer;

		// Default cookie jar lives next to this source file.
		$this->_cookieFileLocation = dirname(__FILE__).'/cookie.txt';
	}

	/** @param string $referer Value sent as the Referer header. */
	public function setReferer($referer)
	{
		$this->_referer = $referer;
	}

	/**
	 * Set the cookie jar path (historical, misspelled name kept for callers).
	 *
	 * @param string $path File used for both CURLOPT_COOKIEJAR and CURLOPT_COOKIEFILE.
	 */
	public function setCookiFileLocation($path)
	{
		$this->_cookieFileLocation = $path;
	}

	/**
	 * Correctly spelled alias of setCookiFileLocation().
	 *
	 * @param string $path File used for both CURLOPT_COOKIEJAR and CURLOPT_COOKIEFILE.
	 */
	public function setCookieFileLocation($path)
	{
		$this->setCookiFileLocation($path);
	}

	/**
	 * Switch the request to POST and set its body.
	 *
	 * @param mixed $postFields Passed unchanged to CURLOPT_POSTFIELDS.
	 */
	public function setPost($postFields)
	{
		$this->_post = true;
		$this->_postFields = $postFields;
	}

	/** @param string $userAgent Value sent as the User-Agent header. */
	public function setUserAgent($userAgent)
	{
		$this->_useragent = $userAgent;
	}

	/**
	 * Perform the request and store the body, HTTP status, sent request
	 * headers and any cURL error on this object.
	 *
	 * @param string|null $url Optional URL override. The string 'nul' (the
	 *                         default) or null keeps the current URL.
	 */
	public function createCurl($url = 'nul')
	{
		// Strict comparison so that only the sentinel (or null) means "no override".
		if ($url !== null && $url !== 'nul') {
			$this->_url = $url;
		}

		$this->_requestHeaders = null;
		$this->_error = '';

		$s = curl_init();
		if ($s === false) {
			$this->_webpage = false;
			$this->_status = 0;
			$this->_error = 'curl_init() failed';
			return;
		}

		curl_setopt($s, CURLOPT_URL, $this->_url);
		curl_setopt($s, CURLOPT_TIMEOUT, $this->_timeout);
		curl_setopt($s, CURLOPT_MAXREDIRS, $this->_maxRedirects);
		curl_setopt($s, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($s, CURLOPT_FOLLOWLOCATION, $this->_followlocation);
		curl_setopt($s, CURLOPT_COOKIEJAR, $this->_cookieFileLocation);
		curl_setopt($s, CURLOPT_COOKIEFILE, $this->_cookieFileLocation);

		// CURLOPT_HTTPHEADER replaces the whole list each time it is set, so the
		// empty 'Expect:' header (which suppresses "Expect: 100-continue") has to
		// be part of the same array as the other headers.
		$header = array(
			'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
			'Accept-Language: en-us,en;q=0.5',
			'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
			'Keep-Alive: 300',
			'Connection: keep-alive',
			'Expect:',
		);
		curl_setopt($s, CURLOPT_HTTPHEADER, $header);

		// Let cURL send Accept-Encoding itself: a hand-written Accept-Encoding
		// header makes servers compress the response without cURL decoding it.
		// An empty string advertises every encoding this cURL build can decode.
		curl_setopt($s, CURLOPT_ENCODING, '');

		if ($this->authentication == 1) {
			curl_setopt($s, CURLOPT_USERPWD, $this->auth_name.':'.$this->auth_pass);
		}

		if ($this->_post) {
			curl_setopt($s, CURLOPT_POST, true);
			curl_setopt($s, CURLOPT_POSTFIELDS, $this->_postFields);
		}

		if ($this->_includeHeader) {
			curl_setopt($s, CURLOPT_HEADER, true);
		}

		if ($this->_noBody) {
			curl_setopt($s, CURLOPT_NOBODY, true);
		}

		// CURLOPT_BINARYTRANSFER is deliberately not set: it was commented out in
		// the original code (where it also referenced a non-existent $_binary
		// property), so $_binaryTransfer is stored but has no effect.

		curl_setopt($s, CURLOPT_USERAGENT, $this->_useragent);
		curl_setopt($s, CURLOPT_REFERER, $this->_referer);
		// Ask cURL to remember the outgoing request headers for getRequestHeaders().
		curl_setopt($s, CURLINFO_HEADER_OUT, true);

		$this->_webpage = curl_exec($s);
		if ($this->_webpage === false) {
			$this->_error = curl_error($s);
		}

		// Kept on the object instead of being var_dump()'d into the page output.
		$this->_requestHeaders = curl_getinfo($s, CURLINFO_HEADER_OUT);
		$this->_status = curl_getinfo($s, CURLINFO_HTTP_CODE);
		curl_close($s);
	}

	/** @return int|null HTTP status of the last request, null before any request. */
	public function getHttpStatus()
	{
		return $this->_status;
	}

	/** @return string|false|null Result of curl_exec() for the last request, null before any request. */
	public function content()
	{
		return $this->_webpage;
	}

	/** @return mixed Request headers recorded for the last transfer, null when none were recorded. */
	public function getRequestHeaders()
	{
		return $this->_requestHeaders;
	}

	/** @return string cURL error message of the last request, '' when none. */
	public function getError()
	{
		return $this->_error;
	}
}