<?php

/*
 * Author				: Ivo Hunink
 * company/copyright	: DataCT    
 * date created			: 01-11-07
 * date modified		: $Id$
 */

include("sphiderIncludes/conf.php");
include("sphiderIncludes/database.php");

// BEGIN ---------------------------------------------------------------------------------- CATEGORY FUNCTIONS 
/**
 * Builds the two-level category overview.
 *
 * Fetches every top-level category (parent_num = 0, ordered by name) and
 * attaches the rows of its direct children under the 'sub' key.
 *
 * @return array array('main_list' => rows or null); each row gains a 'sub'
 *               entry holding whatever sql_fetch_all() returned for its children
 */
function get_categories_view() {
	global $mysql_table_prefix;

	$table = $mysql_table_prefix.'categories';
	$view = array();
	$view['main_list'] = sql_fetch_all('SELECT * FROM '.$table.' WHERE parent_num=0 ORDER BY category');

	// Nothing to decorate when the top-level query produced no rows.
	if (!is_array($view['main_list'])) {
		return $view;
	}

	foreach (array_keys($view['main_list']) as $idx) {
		$parentId = $view['main_list'][$idx]['category_id'];
		$view['main_list'][$idx]['sub'] =  sql_fetch_all('SELECT * FROM '.$table.' WHERE parent_num='.$parentId);
	}

	return $view;
}

/**
 * Collects the data needed to show one category.
 *
 * Returned keys:
 *  - 'main_list'  : every category row (or null when the table is empty)
 *  - 'categories' : the same rows indexed by category_id
 *  - 'subcats'    : direct children of $catid, each with a 'count' entry
 *                   holding the result of a count(*) query on site_category
 *                   (null when $catid has no children)
 *  - 'cat_tree'   : path from the root category down to $catid
 *  - 'cat_sites'  : url/title/short_desc of the sites filed under $catid
 *
 * @param int|string $catid category id; coerced to int before use
 * @return array
 */
function get_category_info($catid) {
	global $mysql_table_prefix;

	// The id is concatenated into SQL below, so force it to an integer.
	$catid = (int)$catid;

	$categories = array(
		'categories' => array(),
		'subcats'    => null,
		'cat_tree'   => array(),
	);
	$categories['main_list'] = sql_fetch_all("SELECT * FROM ".$mysql_table_prefix."categories ORDER BY category");

	// Index the rows by id and group them by parent.
	$children = array();
	if (is_array($categories['main_list'])) {
		foreach($categories['main_list'] as $_val) {
			$categories['categories'][$_val['category_id']] = $_val;
			$children[$_val['parent_num']][] = $_val;
		}
	}

	// Use the $catid argument (the original read $_REQUEST['catid'] here,
	// ignoring the parameter the rest of the function works with).
	if (isset($children[$catid])) {
		$categories['subcats'] = $children[$catid];
	}

	/* count sites */
	if (is_array($categories['subcats'])) {
		foreach ($categories['subcats'] as $_key => $_val) {
			$categories['subcats'][$_key]['count'] = sql_fetch_all('SELECT count(*) FROM '.$mysql_table_prefix.'site_category WHERE 	category_id='.(int)$_val['category_id']);
		}
	}

	/* make tree */
	// Walk up through parent_num until the root (0) is reached. Stop on an
	// unknown id or on a category already visited so that broken or cyclic
	// data cannot loop forever.
	$_parent = $catid;
	$visited = array();
	while ($_parent && isset($categories['categories'][$_parent]) && !isset($visited[$_parent])) {
		$visited[$_parent] = true;
		$categories['cat_tree'][] = $categories['categories'][$_parent];
		$_parent = $categories['categories'][$_parent]['parent_num'];
	}
	$categories['cat_tree'] = array_reverse($categories['cat_tree']);

	/* list category sites */
	$categories['cat_sites'] = sql_fetch_all('SELECT url, title, short_desc FROM '.$mysql_table_prefix.'sites, '.$mysql_table_prefix.'site_category WHERE category_id='.$catid.' AND '.$mysql_table_prefix.'sites.site_id='.$mysql_table_prefix.'site_category.site_id order by title');

	return $categories;
}
// END  ---------------------------------------------------------------------------------- CATEGORY FUNCTIONS 

// BEGIN  ---------------------------------------------------------------------------------- COMMON FUNCTIONS 

	/**
	* Runs a query and returns all rows of its result as an array.
	*
	* Each row is whatever mysql_fetch_array() returns. When MySQL reports an
	* error, the query text and the error message are printed and null is
	* returned.
	*
	* @param string $query SQL query as a string
	* @return array|null list of rows, or null when the query failed or
	*                    produced no rows
	 */
	function sql_fetch_all($query) {
		// Start from null so that "no rows" and "query failed" return null
		// explicitly instead of reading an undefined variable.
		$data = null;

		$result = mysql_query($query);
		if (mysql_errno()) {
			print $query.'<br>'.mysql_error();
			return $data;
		}

		// mysql_query() returns true (not a result set) for statements that
		// produce no rows; only fetch from a real result resource.
		if (is_resource($result)) {
			while($row=mysql_fetch_array($result)) {
				$data[]=$row;
			}
		}
		return $data;
	}



	/*
	Removes duplicate elements from an array.

	The input is sorted in descending order (rsort) and each run of equal
	neighbours (loose != comparison) is collapsed to a single element. The
	result is re-indexed from 0. The caller's array is not modified.

	@param  array $arr values to de-duplicate
	@return array      distinct values, highest first
	*/
	function distinct_array($arr) {
		rsort($arr);

		$newarr = array();
		$havePrevious = false;
		$previous = null;

		// Compare each element with the one before it. The original compared
		// against next(), whose end-of-array "false" is loosely equal to 0 and
		// "", so a trailing falsy value was silently dropped.
		foreach ($arr as $element) {
			if (!$havePrevious || $element != $previous) {
				$newarr[] = $element;
			}
			$previous = $element;
			$havePrevious = true;
		}

		return $newarr;
	}

	/**
	* Returns the id of a category followed by the ids of all its descendants.
	*
	* The category table is walked recursively through parent_num, depth first.
	*
	* @param  int|string $parent category id to start from; coerced to int
	* @return array              $parent followed by every descendant id
	*/
	function get_cats($parent) {
		global $mysql_table_prefix;

		// The id is placed into SQL text, so force it to an integer first.
		$parent = (int)$parent;
		$arr = array($parent);

		$result = mysql_query("SELECT * FROM ".$mysql_table_prefix."categories WHERE parent_num=".$parent);
		if (!$result) {
			// Report the failure of this query (the original echoed
			// mysql_error() before the query had even run).
			echo mysql_error();
			return $arr;
		}

		while ($row = mysql_fetch_array($result)) {
			// Quoted key: the bare word category_id was an undefined constant.
			$arr = add_arrays($arr, get_cats($row['category_id']));
		}

		return $arr;
	}
	
	/**
	* Appends every value of $arr2 to the end of $arr1.
	*
	* Keys of $arr1 are kept as they are; keys of $arr2 are discarded and its
	* values receive the next free numeric keys.
	*
	* @param  array $arr1 base array
	* @param  array $arr2 values to append
	* @return array       $arr1 with the values of $arr2 appended
	*/
	function add_arrays($arr1, $arr2) {
		$combined = $arr1;
		foreach ($arr2 as $ignoredKey => $value) {
			$combined[] = $value;
		}
		return $combined;
	}

	// Lookup table: HTML entity => replacement text. makeboollist() applies it
	// to the raw query string, and the table is also published through
	// $GLOBALS['entities'] below.
	// NOTE(review): every accented-letter entry maps to an empty string.
	// Presumably the original replacement characters were lost in an encoding
	// conversion of this file; as written these entities are simply deleted
	// from the text. Confirm against the upstream source.
	$entities = array
		(
		// NOTE(review): these two keys have no trailing ';', so the ';' of
		// "&amp;" / "&apos;" is left behind after replacement.
		"&amp" => "&",
		"&apos" => "'",
		"&THORN;"  => "",
		"&szlig;"  => "",
		"&agrave;" => "",
		"&aacute;" => "",
		"&acirc;"  => "",
		"&atilde;" => "",
		"&auml;"   => "",
		"&aring;"  => "",
		"&aelig;"  => "",
		"&ccedil;" => "",
		"&egrave;" => "",
		"&eacute;" => "",
		"&ecirc;"  => "",
		"&euml;"   => "",
		"&igrave;" => "",
		"&iacute;" => "",
		"&icirc;"  => "",
		"&iuml;"   => "",
		"&eth;"    => "",
		"&ntilde;" => "",
		"&ograve;" => "",
		"&oacute;" => "",
		"&ocirc;"  => "",
		"&otilde;" => "",
		"&ouml;"   => "",
		"&oslash;" => "",
		"&ugrave;" => "",
		"&uacute;" => "",
		"&ucirc;"  => "",
		"&uuml;"   => "",
		"&yacute;" => "",
		"&thorn;"  => "",
		"&yuml;"   => "",
		// The next two keys repeat entries from the top of the table; a
		// repeated key in an array literal overwrites the earlier value.
		"&THORN;"  => "",
		"&szlig;"  => "",
		"&Agrave;" => "",
		"&Aacute;" => "",
		"&Acirc;"  => "",
		"&Atilde;" => "",
		"&Auml;"   => "",
		"&Aring;"  => "",
		"&Aelig;"  => "",
		"&Ccedil;" => "",
		"&Egrave;" => "",
		"&Eacute;" => "",
		"&Ecirc;"  => "",
		"&Euml;"   => "",
		"&Igrave;" => "",
		"&Iacute;" => "",
		"&Icirc;"  => "",
		"&Iuml;"   => "",
		"&ETH;"    => "",
		"&Ntilde;" => "",
		"&Ograve;" => "",
		"&Oacute;" => "",
		"&Ocirc;"  => "",
		"&Otilde;" => "",
		"&Ouml;"   => "",
		"&Oslash;" => "",
		"&Ugrave;" => "",
		"&Uacute;" => "",
		"&Ucirc;"  => "",
		"&Uuml;"   => "",
		"&Yacute;" => "",
		// NOTE(review): "&Yhorn;" looks like a typo for "&THORN;" - confirm.
		"&Yhorn;"  => "",
		"&Yuml;"   => ""
		);

	// Make the table reachable as a global even when this file is included
	// from inside a function.
	$GLOBALS['entities'] = $entities;

	//Apache multi indexes parameters, stored as a lookup set (parameter => 1)
	$apache_indexes = array();
	foreach (array(
		"N=A", "N=D",
		"M=A", "M=D",
		"S=A", "S=D",
		"D=A", "D=D",
		"C=N;O=A", "C=M;O=A", "C=S;O=A", "C=D;O=A",
		"C=N;O=D", "C=M;O=D", "C=S;O=D", "C=D;O=D",
	) as $_apache_index_param) {
		$apache_indexes[$_apache_index_param] = 1;
	}
	// Do not leave the loop variable behind in the including scope.
	unset($_apache_index_param);


	/**
	* Replaces accented Latin letters with a plain lower-case ASCII letter.
	*
	* The original call passed an empty "from" string to strtr(), which made
	* the function return its input unchanged. The array form of strtr() is
	* used here so that multi-byte characters are replaced as whole units.
	*
	* NOTE(review): the character list is reconstructed from the letter groups
	* of the original replacement string and assumes UTF-8 input - confirm
	* against the upstream source and the encoding of the indexed text.
	*
	* @param  string $string text to transliterate
	* @return string         text with the listed letters replaced
	*/
	function remove_accents($string) {
		static $map = array(
			'À' => 'a', 'Á' => 'a', 'Â' => 'a', 'Ã' => 'a', 'Ä' => 'a', 'Å' => 'a', 'Æ' => 'a',
			'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'a',
			'Ò' => 'o', 'Ó' => 'o', 'Ô' => 'o', 'Õ' => 'o', 'Ö' => 'o', 'Ø' => 'o',
			'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
			'È' => 'e', 'É' => 'e', 'Ê' => 'e', 'Ë' => 'e',
			'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ð' => 'e',
			'Ç' => 'c', 'ç' => 'c',
			'Ð' => 'e',
			'Ì' => 'i', 'Í' => 'i', 'Î' => 'i', 'Ï' => 'i',
			'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
			'Ù' => 'u', 'Ú' => 'u', 'Û' => 'u', 'Ü' => 'u',
			'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
			'Ñ' => 'n', 'ñ' => 'n',
			'Þ' => 't', 'þ' => 't',
			'ß' => 's',
			'ÿ' => 'y', 'ý' => 'y', 'Ý' => 'y',
		);

		return strtr((string)$string, $map);
	}

	// Set of common words, one per line in $include_dir/common.txt, stored as
	// word => 1 so membership can be tested with an array lookup. A missing or
	// unreadable file leaves the set empty (best effort, hence the @).
	$common = array
		(
		);

	$lines = @file($include_dir.'/common.txt');

	if (is_array($lines)) {
		// foreach instead of each(): each() is deprecated since PHP 7.2 and
		// removed in PHP 8.
		foreach ($lines as $word) {
			$common[trim($word)] = 1;
		}
	}

	// List of entries read from ext.txt (one per line, trimmed). A missing or
	// unreadable file leaves the list empty (best effort, hence the @).
	// NOTE(review): presumably these are file extensions - confirm with the
	// code that reads $ext.
	$ext = array
		(
		);

	$lines = @file('ext.txt');

	if (is_array($lines)) {
		// foreach instead of each(): each() is deprecated since PHP 7.2 and
		// removed in PHP 8.
		foreach ($lines as $word) {
			$ext[] = trim($word);
		}
	}

	/**
	* Tells whether a value consists only of the decimal digits 0-9.
	*
	* The original accepted character codes 49..57 only, i.e. '1'..'9', so any
	* value containing the digit '0' ("10", "2007") was rejected.
	*
	* As before, an empty string yields true because it contains no
	* non-digit character.
	*
	* @param  string|int $var value to test
	* @return bool            true when every character is a digit
	*/
	function is_num($var) {
		$var = (string)$var;
		// strspn() gives the length of the leading run of digits; the value
		// is all-digits exactly when that run covers the whole string.
		return strspn($var, '0123456789') === strlen($var);
	}

	/**
	* Returns the first non-empty request-variable array.
	*
	* The sources are checked in this order: $_POST, $_GET, and the legacy
	* globals $HTTP_POST_VARS and $HTTP_GET_VARS. When all of them are empty
	* or missing, an empty array is returned.
	*
	* @return array request variables
	*/
	function getHttpVars() {
		$sources = array(
			isset($_POST) ? $_POST : null,
			isset($_GET) ? $_GET : null,
			isset($GLOBALS['HTTP_POST_VARS']) ? $GLOBALS['HTTP_POST_VARS'] : null,
			isset($GLOBALS['HTTP_GET_VARS']) ? $GLOBALS['HTTP_GET_VARS'] : null,
		);

		$chosen = array();
		foreach ($sources as $candidate) {
			if (is_array($candidate)) {
				$chosen = $candidate;
			}
			// Stop at the first source that actually carries values.
			if (count($chosen) > 0) {
				break;
			}
		}

		return $chosen;
	}
/**
 * Counts the occurrences of $needle in $haystack, overlapping ones included
 * ("aa" occurs three times in "aaaa").
 *
 * @param  string $haystack text to search in
 * @param  string $needle   text to look for
 * @return int              number of occurrences; 0 for an empty needle
 */
function countSubstrs($haystack, $needle) {
	$haystack = (string)$haystack;
	$needle = (string)$needle;

	// An empty needle matches at every offset on PHP 8, which made the
	// original loop never terminate.
	if ($needle === '') {
		return 0;
	}

	$count = 0;
	$offset = 0;
	// Advance by one character after each hit so overlapping matches are
	// still counted, without copying the remainder of the haystack each time.
	while (($pos = strpos($haystack, $needle, $offset)) !== false) {
		$count++;
		$offset = $pos + 1;
	}
	return $count;
}

/**
 * Replaces double quotes with &quot; and single quotes with &apos;.
 *
 * @param  string $str text to convert
 * @return string      converted text
 */
function quote_replace($str) {
	$quotes   = array("\"", "'");
	$entities = array("&quot;", "&apos;");

	return str_replace($quotes, $entities, $str);
}


/**
 * Tells whether dotted version string $version1 is lower than $version2.
 *
 * Components are compared pairwise from the left with PHP's ordinary < and >
 * operators. When every shared component is equal, the version with fewer
 * components is the lower one ("1.3" is lower than "1.3.1").
 *
 * @param  string $version1 first version, e.g. "1.2"
 * @param  string $version2 second version, e.g. "1.10"
 * @return bool             true when $version1 is lower than $version2
 */
function fst_lt_snd($version1, $version2) {

	$list1 = explode(".", $version1);
	$list2 = explode(".", $version2);

	$length1 = count($list1);
	$length2 = count($list2);

	for ($i = 0; $i < $length1; $i++) {
		// The first version has more components than the second and all the
		// shared ones were equal, so it cannot be lower. The original read
		// past the end of $list2 here.
		if ($i >= $length2) {
			return false;
		}
		if ($list1[$i] < $list2[$i]) {
			return true;
		}
		if ($list1[$i] > $list2[$i]) {
			return false;
		}
	}

	return $length1 < $length2;

}

/**
 * Lists the entries of a directory, leaving out "." and "..".
 *
 * Entries are returned in the order readdir() delivers them. When the
 * directory cannot be opened the list is empty.
 *
 * @param  string $dir directory path
 * @return array       entry names
 */
function get_dir_contents($dir) {
	$entries = Array();

	$dirHandle = opendir($dir);
	if (!$dirHandle) {
		return $entries;
	}

	for ($name = readdir($dirHandle); false !== $name; $name = readdir($dirHandle)) {
		if ($name == "." || $name == "..") {
			continue;
		}
		$entries[] = $name;
	}
	closedir($dirHandle);

	return $entries;
}

/**
 * URL-encodes ampersands: every "&" becomes "%26".
 *
 * @param  string $str text to convert
 * @return string      converted text
 */
function replace_ampersand($str) {
	$encoded = strtr($str, array("&" => "%26"));
	return $encoded;
}



    /**
	* Stemming algorithm
    * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
    * All rights reserved.
    * This script is free software.
	* Modified to work with php versions prior 5 by Ando Saabas
    */

	/**
	* Regex fragment matching a single consonant: one of the listed letters,
	* or a 'y' that follows a vowel, or a 'y' at the start of the string.
	* Declared as a global by step1ab(), step1c(), m() and doubleConsonant().
	*/
	$regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';


	/**
	* Regex fragment matching a single vowel: a, e, i, o, u, or a 'y' that
	* does not follow a vowel.
	* Declared as a global by step1ab(), step1c() and m().
	*/
	$regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

	/**
	* Reduces a word to its stem by running it through the stemmer steps
	* (1ab, 1c, 2, 3, 4, 5) in order.
	*
	* @param  string $word Word to stem
	* @return string       Stemmed word; words of one or two characters are
	*                      returned unchanged
	*/
	function stem($word)
	{
		if (strlen($word) <= 2) {
			return $word;
		}

		// Each step takes the word and returns its (possibly shortened) form.
		$pipeline = array('step1ab', 'step1c', 'step2', 'step3', 'step4', 'step5');
		foreach ($pipeline as $step) {
			$word = $step($word);
		}

		return $word;
	}


	/**
	* Step 1, parts a and b: handles a trailing 's' and the endings
	* 'eed', 'ing' and 'ed'.
	*
	* @param  string $word Word to stem
	* @return string       Word after step 1a/1b
	*/
	function step1ab($word)
	{
		global $regex_vowel, $regex_consonant;
		// Part a
		// Only words ending in 's' are considered. replace() returns true as
		// soon as the ending matches, so the OR chain stops at the first rule
		// whose ending fits.
		if (substr($word, -1) == 's') {

			   replace($word, 'sses', 'ss')
			OR replace($word, 'ies', 'i')
			OR replace($word, 'ss', 'ss')
			OR replace($word, 's', '');
		}

		// Part b
		// The inner rules are skipped only when the word ends in 'eed'
		// (second-to-last character 'e' and the 'eed' replace() reports a match).
		if (substr($word, -2, 1) != 'e' OR !replace($word, 'eed', 'ee', 0)) { // First rule
			$v = $regex_vowel;
			// ing and ed
			// The part in front of the ending must contain a vowel before the
			// ending is stripped.
			if (   preg_match("#$v+#", substr($word, 0, -3)) && replace($word, 'ing', '')
				OR preg_match("#$v+#", substr($word, 0, -2)) && replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons

				// If one of above two test successful
				// Restore a final 'e' for at/bl/iz; the AND chain stops at the
				// first ending that matches.
				if (    !replace($word, 'at', 'ate')
					AND !replace($word, 'bl', 'ble')
					AND !replace($word, 'iz', 'ize')) {

					// Double consonant ending
					// Drop one letter of a doubled consonant unless it is ll, ss or zz.
					if (    doubleConsonant($word)
						AND substr($word, -2) != 'll'
						AND substr($word, -2) != 'ss'
						AND substr($word, -2) != 'zz') {

						$word = substr($word, 0, -1);

					} else if (m($word) == 1 AND cvc($word)) {
						// Measure 1 with a consonant-vowel-consonant ending:
						// append 'e'.
						$word .= 'e';
					}
				}
			}
		}

		return $word;
	}


	/**
	* Step 1c: turns a final 'y' into 'i' when the rest of the word
	* contains a vowel.
	*
	* @param  string $word Word to stem
	* @return string       Word after step 1c
	*/
	function step1c($word)
	{
		global $regex_vowel, $regex_consonant;

		if (substr($word, -1) != 'y') {
			return $word;
		}

		$body = substr($word, 0, -1);
		if (preg_match("#$regex_vowel+#", $body)) {
			replace($word, 'y', 'i');
		}

		return $word;
	}


	/**
	* Step 2: maps longer suffixes to shorter ones.
	*
	* The rules are grouped by the second-to-last letter of the word. Within
	* a group they are tried in order, stopping at the first whose ending
	* matches; the ending is only replaced when the remaining stem has a
	* measure greater than 0.
	*
	* @param  string $word Word to stem
	* @return string       Word after step 2
	*/
	function step2($word)
	{
		$rulesByLetter = array(
			'a' => array(array('ational', 'ate'), array('tional', 'tion')),
			'c' => array(array('enci', 'ence'), array('anci', 'ance')),
			'e' => array(array('izer', 'ize')),
			'g' => array(array('logi', 'log')),
			'l' => array(
				array('entli', 'ent'),
				array('ousli', 'ous'),
				array('alli', 'al'),
				array('bli', 'ble'),
				array('eli', 'e'),
			),
			'o' => array(array('ization', 'ize'), array('ation', 'ate'), array('ator', 'ate')),
			's' => array(
				array('iveness', 'ive'),
				array('fulness', 'ful'),
				array('ousness', 'ous'),
				array('alism', 'al'),
			),
			't' => array(array('biliti', 'ble'), array('aliti', 'al'), array('iviti', 'ive')),
		);

		$penultimate = substr($word, -2, 1);
		if (isset($rulesByLetter[$penultimate])) {
			foreach ($rulesByLetter[$penultimate] as $rule) {
				if (replace($word, $rule[0], $rule[1], 0)) {
					break;
				}
			}
		}

		return $word;
	}


	/**
	* Step 3: shortens or removes a further set of suffixes.
	*
	* The rules are grouped by the second-to-last letter of the word and tried
	* in order until one ending matches; the ending is only replaced when the
	* remaining stem has a measure greater than 0.
	*
	* @param  string $word String to stem
	* @return string       Word after step 3
	*/
	function step3($word)
	{
		$rulesByLetter = array(
			'a' => array(array('ical', 'ic')),
			's' => array(array('ness', '')),
			't' => array(array('icate', 'ic'), array('iciti', 'ic')),
			'u' => array(array('ful', '')),
			'v' => array(array('ative', '')),
			'z' => array(array('alize', 'al')),
		);

		$penultimate = substr($word, -2, 1);
		if (isset($rulesByLetter[$penultimate])) {
			foreach ($rulesByLetter[$penultimate] as $rule) {
				if (replace($word, $rule[0], $rule[1], 0)) {
					break;
				}
			}
		}

		return $word;
	}


	/**
	* Step 4: removes a final set of suffixes.
	*
	* The suffixes are grouped by the second-to-last letter of the word and
	* tried in order until one ending matches; the ending is only removed when
	* the remaining stem has a measure greater than 1. For the letter 'o',
	* 'ion' is tried when the word ends in 'tion' or 'sion', otherwise 'ou'.
	*
	* @param  string $word Word to stem
	* @return string       Word after step 4
	*/
	function step4($word)
	{
		$suffixesByLetter = array(
			'a' => array('al'),
			'c' => array('ance', 'ence'),
			'e' => array('er'),
			'i' => array('ic'),
			'l' => array('able', 'ible'),
			'n' => array('ant', 'ement', 'ment', 'ent'),
			's' => array('ism'),
			't' => array('ate', 'iti'),
			'u' => array('ous'),
			'v' => array('ive'),
			'z' => array('ize'),
		);

		$penultimate = substr($word, -2, 1);

		if ($penultimate == 'o') {
			$lastFour = substr($word, -4);
			$suffix = ($lastFour == 'tion' OR $lastFour == 'sion') ? 'ion' : 'ou';
			replace($word, $suffix, '', 1);
			return $word;
		}

		if (isset($suffixesByLetter[$penultimate])) {
			foreach ($suffixesByLetter[$penultimate] as $suffix) {
				if (replace($word, $suffix, '', 1)) {
					break;
				}
			}
		}

		return $word;
	}


	/**
	* Step 5: tidies the end of the word.
	*
	* Part a drops a final 'e' when the stem before it has a measure above 1,
	* or a measure of exactly 1 and no consonant-vowel-consonant ending.
	* Part b drops one 'l' from a final 'll' when the measure is above 1.
	*
	* @param  string $word Word to stem
	* @return string       Word after step 5
	*/
	function step5($word)
	{
		// Part a
		if (substr($word, -1) == 'e') {
			$stemBeforeE = substr($word, 0, -1);
			$measure = m($stemBeforeE);

			if ($measure > 1 OR ($measure == 1 AND !cvc($stemBeforeE))) {
				replace($word, 'e', '');
			}
		}

		// Part b
		if (substr($word, -1) == 'l' AND m($word) > 1 AND doubleConsonant($word)) {
			$word = substr($word, 0, -1);
		}

		return $word;
	}


	/**
	* Swaps the ending $check of $str for $repl, in place.
	*
	* When $m is given, the swap only happens if the part of $str in front of
	* the ending has a measure (see m()) greater than $m.
	*
	* @param  string $str   String to check; modified by reference
	* @param  string $check Ending to check for
	* @param  string $repl  Replacement string
	* @param  int    $m     Optional measure the stem must exceed
	* @return bool          Whether $str ended in $check. True does not
	*                       necessarily mean that it was replaced.
	*/
	function replace(&$str, $check, $repl, $m = null)
	{
		$cut = -strlen($check);

		if (substr($str, $cut) != $check) {
			return false;
		}

		$stem = substr($str, 0, $cut);
		if ($m === null OR m($stem) > $m) {
			$str = $stem . $repl;
		}

		return true;
	}


	/**
	* Computes the "measure" of a string: the number of vowel-sequence /
	* consonant-sequence pairs left after leading consonants and trailing
	* vowels are stripped. With c a consonant sequence, v a vowel sequence and
	* <..> marking optional parts:
	*
	* <c><v>       gives 0
	* <c>vc<v>     gives 1
	* <c>vcvc<v>   gives 2
	* <c>vcvcvc<v> gives 3
	*
	* @param  string $str The string to return the m count for
	* @return int         The m count
	*/
	function m($str)
	{
		global $regex_vowel, $regex_consonant;
		$cons = $regex_consonant;
		$vow = $regex_vowel;

		// Strip the optional leading consonants, then the optional trailing vowels.
		$core = preg_replace(array("#^$cons+#", "#$vow+$#"), '', $str);

		// Every remaining vowel-run + consonant-run pair counts as one.
		preg_match_all("#($vow+$cons+)#", $core, $pairs);

		return count($pairs[0]);
	}


	/**
	* Returns true/false as to whether the given string ends in two
	* identical consonants.
	*
	* @param  string $str String to check
	* @return bool        Result
	*/
	function doubleConsonant($str)
	{
		global $regex_consonant;
		$c = $regex_consonant;

		if (!preg_match("#$c{2}$#", $str, $matches)) {
			return false;
		}

		// Square-bracket offsets: the $string{n} form used originally is a
		// parse error on PHP 8.
		return $matches[0][0] == $matches[0][1];
	}


	/**
	* Checks for ending CVC sequence where second C is not W, X or Y
	*
	* @param  string $str String to check
	* @return bool        Result
	*/
	function cvc($str)
	{
		// Without this declaration $c and $v were empty, the pattern matched
		// the empty string and the function returned false for every input.
		global $regex_vowel, $regex_consonant;
		$c = $regex_consonant;
		$v = $regex_vowel;

		if (!preg_match("#($c$v$c)$#", $str, $matches)) {
			return false;
		}
		if (strlen($matches[1]) != 3) {
			return false;
		}

		// Square-bracket offsets: the $string{n} form used originally is a
		// parse error on PHP 8.
		$last = $matches[1][2];
		return $last != 'w' AND $last != 'x' AND $last != 'y';
	}

// END  ---------------------------------------------------------------------------------- COMMON FUNCTIONS 

// BEGIN  ---------------------------------------------------------------------------------- SEARCH FUNCTIONS 


	
	/**
	* Moves one chosen element of $arr to position $start by swapping.
	*
	* Scanning from $start, the chosen element is the first one whose 'domain'
	* equals $domain; the scan stops there. If none is met, it is the element
	* with the highest 'weight' (the earliest one on ties).
	*
	* @param array $arr    list of rows with 'weight' and 'domain'; modified in place
	* @param int   $start  index to fill
	* @param mixed $domain domain value that takes precedence over weight
	*/
	function swap_max (&$arr, $start, $domain) {
		$chosen = $start;
		$bestWeight = $arr[$chosen]['weight'];

		$i = $start;
		while ($i < count($arr)) {
			if ($arr[$i]['domain'] == $domain) {
				// Same domain wins outright; no need to look further.
				$chosen = $i;
				break;
			}
			if ($arr[$i]['weight'] > $bestWeight) {
				$chosen = $i;
				$bestWeight = $arr[$i]['weight'];
			}
			$i++;
		}

		$displaced = $arr[$start];
		$arr[$start] = $arr[$chosen];
		$arr[$chosen] = $displaced;
	}

	/**
	* Orders result rows in place with repeated swap_max() calls.
	*
	* For each position in turn, swap_max() brings forward either a row from
	* the same domain as the row just placed or, failing that, the heaviest
	* remaining row.
	*
	* @param array $arr list of rows with 'weight' and 'domain'; modified in place
	*/
	function sort_with_domains (&$arr) {
		$previousDomain = -1;
		$position = 0;
		while ($position < count($arr)-1) {
			swap_max($arr, $position, $previousDomain);
			$previousDomain = $arr[$position]['domain'];
			$position++;
		}
	}
	
	/**
	* usort() callback that orders rows by 'weight', highest first.
	*
	* @param  array $a first row
	* @param  array $b second row
	* @return int      0 when the weights are equal, -1 when $a is heavier, 1 otherwise
	*/
	function cmp($a, $b) {
		$left = $a['weight'];
		$right = $b['weight'];

		if ($left == $right) {
			return 0;
		}
		if ($left > $right) {
			return -1;
		}
		return 1;
	}

	/**
	* Joins the words of a query with '+' signs.
	*
	* Runs of spaces are collapsed first; a space in front of an existing '+'
	* is dropped so that no "++" is produced.
	*
	* @param  string $a query text
	* @return string    query text with spaces replaced by '+'
	*/
	function addmarks($a) {
		// preg_replace() instead of eregi_replace(), which was removed in PHP 7.
		$a = preg_replace('/[ ]+/', " ", $a);
		$a = str_replace(" +", "+", $a);
		$a = str_replace(" ", "+", $a);
		return $a;
	}

	/**
	* Parses a raw query string into word lists for search().
	*
	* Keys that may be present in the result:
	*  - '+s'      : quoted phrases that must occur
	*  - '-s'      : quoted phrases prefixed with '-'
	*  - '+'       : words to search for (words of '+s' phrases included)
	*  - '-'       : words prefixed with '-'
	*  - 'ignore'  : words rejected by ignoreWord()
	*  - 'hilight' : phrases and words to highlight, plus their stems when
	*                $stem_words is 1
	* A key is absent when nothing was collected for it.
	*
	* @param  string $a raw query text
	* @return array     word lists as described above
	*/
	function makeboollist($a) {
		global $entities, $stem_words;

		// foreach instead of each(): each() advanced the internal pointer of
		// the global table and never reset it, so from the second call on no
		// entity was replaced at all. str_ireplace() keeps the match
		// case-insensitive but literal.
		if (is_array($entities)) {
			foreach ($entities as $entity => $replacement) {
				$a = str_ireplace($entity, $replacement, $a);
			}
		}
		$a = trim($a);

		$a = str_ireplace("&quot;", "\"", $a);
		$returnWords = array();

		//get all phrases
		$regs = Array();
		while (preg_match('/(-?)"([^"]+)"/', $a, $regs)) {
			if ($regs[1] == '') {
				$returnWords['+s'][] = $regs[2];
				$returnWords['hilight'][] = $regs[2];
			} else {
				$returnWords['-s'][] = $regs[2];
			}
			// Remove the phrase so the loop ends and its words are not
			// parsed again as single words below.
			$a = str_replace($regs[0], "", $a);
		}

		$a = strtolower(preg_replace('/[ ]+/', " ", $a));
//		$a = remove_accents($a);
		$a = trim($a);
		$words = ($a == "") ? array() : explode(' ', $a);

		//get all words (both include and exlude)
		$includeWords = array();
		foreach ($words as $token) {
			$prefix = substr($token, 0, 1);

			if ($prefix == '-') {
				$returnWords['-'][] = substr($token, 1);
				continue;
			}

			// A leading '+' is optional and carries no extra meaning here.
			$term = ($prefix == '+') ? substr($token, 1) : $token;
			$includeWords[] = $term;

			if (!ignoreWord($term)) {
				$returnWords['hilight'][] = $term;
				if ($stem_words == 1) {
					$returnWords['hilight'][] = stem($term);
				}
			}
		}

		//add words from phrases to includes
		if (isset($returnWords['+s'])) {
			foreach ($returnWords['+s'] as $phrase) {
				$phrase = strtolower(preg_replace('/[ ]+/', " ", $phrase));
				$phrase = trim($phrase);
				foreach (explode(' ', $phrase) as $w) {
					$includeWords[] = $w;
				}
			}
		}

		foreach ($includeWords as $word) {
			if ($word == '') {
				continue;
			}
			if (ignoreWord($word)) {
				$returnWords['ignore'][] = $word;
			} else {
				$returnWords['+'][] = $word;
			}
		}

		return $returnWords;
	}

	/**
	* Decides whether a word should be left out of a search.
	*
	* A word is ignored when it is shorter than $min_word_length, when it
	* contains no letter (or no letter or digit when $index_numbers is 1)
	* after remove_accents(), or when it is listed in $common.
	*
	* @param  string $word word to test
	* @return int          1 when the word is to be ignored, 0 otherwise
	*/
	function ignoreword($word) {
		global $common;
		global $min_word_length;
		global $index_numbers;

		// preg_match() with the i flag replaces eregi(), removed in PHP 7.
		if ($index_numbers == 1) {
			$pattern = '/[a-z0-9]+/i';
		} else {
			$pattern = '/[a-z]+/i';
		}

		if (strlen($word) < $min_word_length) {
			return 1;
		}
		if (!preg_match($pattern, remove_accents($word))) {
			return 1;
		}
		// isset() first: most words are not in the list, and reading a
		// missing key raised an undefined-index notice.
		if (isset($common[$word]) && $common[$word] == 1) {
			return 1;
		}
		return 0;
	}

	/**
	* Runs a parsed query against the keyword index and returns one page of results.
	*
	* @param  array      $searchstr word lists as produced by makeboollist()
	*                               ('+', '-', '+s' keys are read)
	* @param  int        $category  category id to restrict to (values below 1: no restriction)
	* @param  int        $start     1-based page number
	* @param  int        $per_page  results per page
	* @param  string     $type      "or" to match any word; any other value requires all words
	* @param  string     $domain    domain name to restrict to ("" or unknown: no restriction)
	* @return array|null null when there are no '+' words or nothing matched;
	*                    array('did_you_mean' => array|null) when nothing matched
	*                    and suggestions were looked up;
	*                    array('results' => total) when the requested page is empty;
	*                    otherwise numerically indexed rows (title, url, fulltxt,
	*                    size, weight, domain) plus 'maxweight' and 'results'
	*/
	function search($searchstr, $category, $start, $per_page, $type, $domain) {
		global $length_of_link_desc,$mysql_table_prefix, $show_meta_description, $merge_site_results, $stem_words, $did_you_mean_enabled ;

		// NOTE(review): this unconditionally overrides the configured value of
		// $did_you_mean_enabled; kept as in the original - confirm it is intended.
		$did_you_mean_enabled = true;

		if (!isset($searchstr['+']) || count($searchstr['+']) == 0) {
			return null;
		}

		$possible_to_find = 1;

		// $domain is request data: escape it before it goes into SQL text.
		$domain_qry = "";
		$result = mysql_query("select domain_id from ".$mysql_table_prefix."domains where domain = '".addslashes($domain)."'");
		if ($result && mysql_num_rows($result)> 0) {
			$thisrow = mysql_fetch_array($result);
			$domain_qry = "and domain = ".(int)$thisrow[0];
		}

		//find all sites that should not be included in the result
		$wordarray = isset($searchstr['-']) ? $searchstr['-'] : array();
		$notlist = array();
		$not_words = 0;
		while ($not_words < count($wordarray)) {
			if ($stem_words == 1) {
				$searchword = addslashes(stem($wordarray[$not_words]));
			} else {
				$searchword = addslashes($wordarray[$not_words]);
			}
			// Keywords are spread over tables named after the first hex digit
			// of the md5 of the word.
			$wordmd5 = substr(md5($searchword), 0, 1);

			$query1 = "SELECT link_id from ".$mysql_table_prefix."link_keyword$wordmd5, ".$mysql_table_prefix."keywords where ".$mysql_table_prefix."link_keyword$wordmd5.keyword_id= ".$mysql_table_prefix."keywords.keyword_id and keyword='$searchword'";

			$result = mysql_query($query1);

			while ($result && ($row = mysql_fetch_row($result))) {
				$notlist[$not_words]['id'][$row[0]] = 1;
			}
			$not_words++;
		}


		//find all sites containing the search phrase
		$wordarray = isset($searchstr['+s']) ? $searchstr['+s'] : array();
		$phraselist = array();
		$phrase_words = 0;
		while ($phrase_words < count($wordarray)) {

			$searchword = addslashes($wordarray[$phrase_words]);
			$query1 = "SELECT link_id from ".$mysql_table_prefix."links where fulltxt like '% $searchword%'";
			$result = mysql_query($query1);
			echo mysql_error();
			$num_rows = $result ? mysql_num_rows($result) : 0;
			if ($num_rows == 0) {
				$possible_to_find = 0;
				break;
			}
			while ($row = mysql_fetch_row($result)) {
				$phraselist[$phrase_words]['id'][$row[0]] = 1;
			}
			$phrase_words++;
		}


		$category_list = array();
		if (($category> 0) && $possible_to_find==1) {
			// Ids end up in an IN (...) list, so make sure they are integers.
			$allcats = array_map('intval', get_cats((int)$category));
			$catlist = implode(",", $allcats);
			$query1 = "select link_id from ".$mysql_table_prefix."links, ".$mysql_table_prefix."sites, ".$mysql_table_prefix."categories, ".$mysql_table_prefix."site_category where ".$mysql_table_prefix."links.site_id = ".$mysql_table_prefix."sites.site_id and ".$mysql_table_prefix."sites.site_id = ".$mysql_table_prefix."site_category.site_id and ".$mysql_table_prefix."site_category.category_id in ($catlist)";
			$result = mysql_query($query1);
			echo mysql_error();
			$num_rows = $result ? mysql_num_rows($result) : 0;
			if ($num_rows == 0) {
				$possible_to_find = 0;
			}
			while ($result && ($row = mysql_fetch_row($result))) {
				$category_list[$row[0]] = 1;
			}
		}


		//find all sites that include the search word
		$wordarray = $searchstr['+'];
		$linklist = array();
		$domains = array();
		$words = 0;
		while (($words < count($wordarray)) && $possible_to_find == 1) {
			if ($stem_words == 1) {
				$searchword = addslashes(stem($wordarray[$words]));
			} else {
				$searchword = addslashes($wordarray[$words]);
			}
			$wordmd5 = substr(md5($searchword), 0, 1);
			$query1 = "SELECT distinct link_id, weight, domain from ".$mysql_table_prefix."link_keyword$wordmd5, ".$mysql_table_prefix."keywords where ".$mysql_table_prefix."link_keyword$wordmd5.keyword_id= ".$mysql_table_prefix."keywords.keyword_id and keyword='$searchword' $domain_qry order by weight desc";
			$result = mysql_query($query1);
			echo mysql_error();
			$num_rows = $result ? mysql_num_rows($result) : 0;
			if ($num_rows == 0) {
				if ($type != "or") {
					$possible_to_find = 0;
					break;
				}
			}
			// In "or" mode every word feeds the same list; otherwise each
			// word gets its own list so they can be intersected below.
			if ($type == "or") {
				$indx = 0;
			} else {
				$indx = $words;
			}

			while ($result && ($row = mysql_fetch_row($result))) {
				$linklist[$indx]['id'][] = $row[0];
				$domains[$row[0]] = $row[2];
				$linklist[$indx]['weight'][$row[0]] = $row[1];
			}
			$words++;
		}


		if ($type == "or") {
			$words = 1;
		}
		$result_array_full = Array();

		if ($words == 1 && $not_words == 0 && $category < 1) { //if there is only one search word, we already have the result
			if (isset($linklist[0]['weight'])) {
				$result_array_full = $linklist[0]['weight'];
			}
		} else { //otherwise build an intersection of all the results
			// Start from the word with the fewest hits: the intersection can
			// never be larger than that list.
			$j= 1;
			$min = 0;
			while ($j < $words) {
				if (count($linklist[$min]['id']) > count($linklist[$j]['id'])) {
					$min = $j;
				}
				$j++;
			}

			$temp_array = isset($linklist[$min]['id']) ? $linklist[$min]['id'] : array();
			$j = 0;
			while ($j < count($temp_array)) {
				$link_id = $temp_array[$j];
				$k = 0; //and word counter
				$n = 0; //not word counter
				$o = 0; //phrase word counter
				$weight = 1;
				$break = 0;
				while ($k < $words && $break== 0) {
					if (isset($linklist[$k]['weight'][$link_id]) && $linklist[$k]['weight'][$link_id] > 0) {
						$weight = $weight + $linklist[$k]['weight'][$link_id];
					} else {
						$break = 1;
					}
					$k++;
				}
				while ($n < $not_words && $break== 0) {
					if (isset($notlist[$n]['id'][$link_id]) && $notlist[$n]['id'][$link_id] > 0) {
						$break = 1;
					}
					$n++;
				}

				while ($o < $phrase_words && $break== 0) {
					// Indexed with $o, the phrase counter. The original used
					// $n (the exclusion counter), so with several phrases and
					// no exclusions only the first phrase was ever checked.
					if (!isset($phraselist[$o]['id'][$link_id]) || $phraselist[$o]['id'][$link_id] != 1) {
						$break = 1;
					}
					$o++;
				}
				if ($break== 0 && $category > 0 && (!isset($category_list[$link_id]) || $category_list[$link_id] != 1)) {
					$break = 1;
				}

				if ($break == 0) {
					$result_array_full[$link_id] = $weight;
				}
				$j++;
			}
		}//word == 1


		if ((count($result_array_full) == 0 || $possible_to_find == 0) && $did_you_mean_enabled == 1) {
			// Nothing found: suggest the closest indexed keyword for each
			// search word (same soundex, Levenshtein distance below 4).
			$near_words = array();
			foreach ($searchstr['+'] as $word) {
				$word = addslashes($word);
				$result = mysql_query("select keyword from ".$mysql_table_prefix."keywords where soundex(keyword) = soundex('$word')");
				$max_distance = 100;
				$near_word ="";
				while ($result && ($row=mysql_fetch_row($result))) {

					$distance = levenshtein($row[0], $word);
					if ($distance < $max_distance && $distance <4) {
						$max_distance = $distance;
						$near_word = $row[0];
					}
				}

				if ($near_word != "" && $word != $near_word) {
					$near_words[$word] = $near_word;
				}

			}
			// null (not an empty array) when there is no suggestion, so that
			// isset($res['did_you_mean']) stays false for callers.
			$res = array();
			$res['did_you_mean'] = count($near_words) > 0 ? $near_words : null;
			return $res;
		}
		if (count($result_array_full) == 0) {
			return null;
		}
		arsort ($result_array_full);


		// When merging per site, keep the best link of every domain in ranking
		// order and remember the second-best one to place right after it.
		$domains_to_show = array();
		if ($merge_site_results == 1 && $domain_qry == "") {
			$result_array_temp = array();
			foreach ($result_array_full as $key => $value) {
				if (!isset($domains_to_show[$domains[$key]])) {
					$result_array_temp[$key] = $value;
					$domains_to_show[$domains[$key]] = 1;
				} else if ($domains_to_show[$domains[$key]] ==  1) {
					$domains_to_show[$domains[$key]] = Array ($key => $value);
				}
			}
		} else {
			$result_array_temp = $result_array_full;
		}


		$result_array = array();
		foreach ($result_array_temp as $key => $value) {
			$result_array[$key] = $value;
			if (isset ($domains_to_show[$domains[$key]]) && is_array($domains_to_show[$domains[$key]])) {
				foreach ($domains_to_show[$domains[$key]] as $k => $v) {
					$result_array[$k] = $v;
				}
			}
		}

		$results = count($result_array);

		$keys = array_keys($result_array);
		$maxweight = $result_array[$keys[0]];

		// Link ids of the requested page.
		$first = max(0, ((int)$start - 1) * (int)$per_page);
		$last = min($results, $first + (int)$per_page);
		$in = array();
		for ($i = $first; $i < $last; $i++) {
			$in[] = $keys[$i];
		}
		if (count($in) == 0) {
			$res = array();
			$res['results'] = $results;
			return $res;
		}

		$inlist = implode(",", $in);


		if ($length_of_link_desc == 0) {
			$fulltxt = "fulltxt";
		} else {
			$fulltxt = "substring(fulltxt, 1, ".(int)$length_of_link_desc.")";
		}

		$query1 = "SELECT distinct link_id, url, title, description,  $fulltxt, size FROM ".$mysql_table_prefix."links WHERE link_id in ($inlist)";

		$result = mysql_query($query1);
		echo mysql_error();

		$res = array();
		$i = 0;
		while ($result && ($row = mysql_fetch_row($result))) {
			$res[$i]['title'] = $row[2];
			$res[$i]['url'] = $row[1];
			if ($row[3] != null && $show_meta_description == 1)
				$res[$i]['fulltxt'] = $row[3];
			else
				$res[$i]['fulltxt'] = $row[4];
			$res[$i]['size'] = $row[5];
			$res[$i]['weight'] = $result_array[$row[0]];
			$dom_result = mysql_query("select domain from ".$mysql_table_prefix."domains where domain_id='".$domains[$row[0]]."'");
			$dom_row = $dom_result ? mysql_fetch_row($dom_result) : false;
			$res[$i]['domain'] = $dom_row ? $dom_row[0] : null;
			$i++;
		}



		if ($merge_site_results  && $domain_qry == "") {
			sort_with_domains($res);
		} else {
			usort($res, "cmp");
		}
		echo mysql_error();
		$res['maxweight'] = $maxweight;
		$res['results'] = $results;
		return $res;
	}

// Default number of results per page; get_search_results() replaces it when
// it is called with a non-empty $results argument.
$GLOBALS['results_per_page'] = 10;

/**
 * Wraps every case-insensitive occurrence of any of $terms in $text in <b> tags.
 *
 * All terms are matched in a single pass, longest first, so text that has
 * already been wrapped is never matched again and each occurrence keeps its
 * own letter case. Terms are treated as literal text, not as patterns.
 *
 * @param  string $text  text to mark up
 * @param  array  $terms terms to highlight
 * @return string        marked-up text ($text unchanged when nothing applies)
 */
function sph_highlight_terms($text, $terms) {
	$lengths = array();
	foreach ($terms as $term) {
		$term = (string)$term;
		if ($term !== '') {
			$lengths[preg_quote($term, '/')] = strlen($term);
		}
	}
	if (count($lengths) == 0) {
		return $text;
	}

	// Longest first: alternation takes the first alternative that matches.
	arsort($lengths);
	$marked = preg_replace('/'.implode('|', array_keys($lengths)).'/i', '<b>$0</b>', $text);

	return ($marked === null) ? $text : $marked;
}

/**
 * Runs a search and prepares everything the result page needs.
 *
 * Keys of the returned array: 'ignore_words', 'ent_query', 'time',
 * 'did_you_mean', 'did_you_mean_b', 'num_of_results', 'from', 'to',
 * 'total_results', 'qry_results' (only when there are rows), 'pages',
 * 'prev', 'next', 'start', 'query' and 'other_pages' (only when the page
 * holds results).
 *
 * @param  string     $query      raw query text
 * @param  int        $start      1-based page number (0 is treated as 1)
 * @param  int        $category   category id to restrict to
 * @param  string     $searchtype "phrase", "or", or any other value for "all words"
 * @param  int|string $results    results per page; "" keeps the current setting
 * @param  string     $domain     domain to restrict to
 * @return array
 */
function get_search_results($query, $start, $category, $searchtype, $results, $domain) {
	global $sph_spiderMessages, $results_per_page,
		$links_to_next,
		$show_query_scores,
		$mysql_table_prefix,
		$desc_length;

	// Only accept a positive page size: it is used as a divisor below.
	if ($results != "" && (int)$results > 0) {
		$results_per_page = (int)$results;
	}

	if ($searchtype == "phrase") {
	   $query=str_replace('"','',$query);
	   $query = "\"".$query."\"";
	}

	$starttime = getmicrotime();
	// catch " if only one time entered
	if (substr_count($query,'"')==1){
		$query=str_replace('"','',$query);
	}
	$words = makeboollist($query);
	$hilight = (isset($words['hilight']) && is_array($words['hilight'])) ? $words['hilight'] : array();

	$full_result = array();
	$full_result['ignore_words'] = isset($words['ignore']) ? $words['ignore'] : null;

	if ($start==0)
		$start=1;
	$result = search($words, $category, $start, $results_per_page, $searchtype, $domain);
	$query= stripslashes($query);

	$full_result['ent_query'] = htmlspecialchars($query);

	$endtime = getmicrotime() - $starttime;
	$rows = isset($result['results']) ? $result['results'] : null;
	$time = round($endtime*100)/100;

	$full_result['time'] = $time;

	// Build the "did you mean" text, plain and with the changed words in bold.
	$did_you_mean = "";
	$did_you_mean_b = "";
	if (!empty($result['did_you_mean']) && is_array($result['did_you_mean'])) {
		$did_you_mean_b=$query;
		$did_you_mean=$query;
		foreach ($result['did_you_mean'] as $key => $val) {
			if ($key != $val) {
				$did_you_mean_b = str_replace($key, "<b>$val</b>", $did_you_mean_b);
				$did_you_mean = str_replace($key, "$val", $did_you_mean);
			}
		}
	}

	$full_result['did_you_mean'] = $did_you_mean;
	$full_result['did_you_mean_b'] = $did_you_mean_b;

	// search() adds two bookkeeping keys ('maxweight', 'results') to the rows
	// of the page; the remaining entries are the rows themselves.
	$num_of_results = is_array($result) ? max(0, count($result) - 2) : 0;

	$full_result['num_of_results'] = $num_of_results;


	if ($start < 2)
		saveToLog(addslashes($query), $time, $rows);
	$from = ($start-1) * $results_per_page+1;
	$to = min(($start)*$results_per_page, $rows);


	$full_result['from'] = $from;
	$full_result['to'] = $to;
	$full_result['total_results'] = $rows;

	if ($rows>0) {
		$maxweight = $result['maxweight'];
		$i = 0;
		while ($i < $num_of_results && $i < $results_per_page) {
			$title = $result[$i]['title'];
			$url = $result[$i]['url'];
			$fulltxt = $result[$i]['fulltxt'];
			$page_size = $result[$i]['size'];
			$domain_name = $result[$i]['domain'];
			if ($page_size!="")
				$page_size = number_format($page_size, 1)."kb";

			if (strlen($fulltxt) > $desc_length) {
				// Collect the offsets of every highlight word in the text.
				$places = array();
				$lowered = strtolower($fulltxt);
				foreach($hilight as $word) {
					$word = (string)$word;
					if ($word === '') {
						continue;
					}
					$tmp = $lowered;
					$found_in = strpos($tmp, $word);
					$sum = -strlen($word);
					// Strict test: a hit at offset 0 is a hit. The original
					// compared with == '' and stopped scanning there.
					while ($found_in !== false) {
						$pos = $found_in+strlen($word);
						$sum += $pos;
						$tmp = substr($tmp, $pos);
						$places[] = $sum;
						$found_in = strpos($tmp, $word);
					}
				}
				sort($places);

				// Choose where the excerpt starts. The bounds check comes
				// first so $places is never read past its end.
				$x = 0;
				$begin = 0;
				$place_count = count($places);
				$last_start = strlen($fulltxt) - $desc_length;
				foreach ($places as $id => $place) {
					while ($x+$id < $place_count && $places[$id + $x] - $place < $desc_length && $place < $last_start) {
						$x++;
						$begin = $id;
					}
				}

				$anchor = isset($places[$begin]) ? $places[$begin] : 0;
				$begin_pos = max(0, $anchor - 30);
				$fulltxt = substr($fulltxt, $begin_pos, $desc_length);

				// Start at a word boundary unless the excerpt begins at the
				// very start of the text.
				if ($anchor > 0) {
					$begin_pos = (int)strpos($fulltxt, " ");
				}
				$fulltxt = substr($fulltxt, $begin_pos, $desc_length);

				// End at a word boundary; keep the text as it is when it has
				// no space (the original emptied it in that case).
				$last_space = strrpos($fulltxt, " ");
				if ($last_space !== false) {
					$fulltxt = substr($fulltxt, 0, $last_space);
				}
			}

			// Guard the division: a maximum weight of 0 would divide by zero.
			if ($maxweight > 0) {
				$weight = number_format($result[$i]['weight']/$maxweight*100, 2);
			} else {
				$weight = number_format(0, 2);
			}

			if ($title=='')
				$title = $sph_spiderMessages["Untitled"];

			// Shorten before highlighting so a <b> tag is never cut in half.
			if (strlen($title) > 80) {
				$title = substr($title, 0,75)."...";
			}

			$title = sph_highlight_terms($title, $hilight);
			$fulltxt = sph_highlight_terms($fulltxt, $hilight);
			$url2 = sph_highlight_terms($url, $hilight);

			$num = $from + $i;
			$full_result['qry_results'][$i]['num'] =  $num;
			$full_result['qry_results'][$i]['weight'] =  $weight;
			$full_result['qry_results'][$i]['url'] =  $url;
			$full_result['qry_results'][$i]['title'] =  $title;
			$full_result['qry_results'][$i]['fulltxt'] =  $fulltxt;
			$full_result['qry_results'][$i]['url2'] =  $url2;
			$full_result['qry_results'][$i]['page_size'] =  $page_size;
			$full_result['qry_results'][$i]['domain_name'] =  $domain_name;
			$i++;
		}
	}



	$pages = ($results_per_page > 0) ? ceil($rows / $results_per_page) : 0;
	$full_result['pages'] = $pages;
	$prev = $start - 1;
	$full_result['prev'] = $prev;
	$next = $start + 1;
	$full_result['next'] = $next;
	$full_result['start'] = $start;
	$full_result['query'] = $query;

	if ($from <= $to) {

		// Window of page numbers shown around the current page.
		$firstpage = $start - $links_to_next;
		if ($firstpage < 1) $firstpage = 1;
		$lastpage = $start + $links_to_next;
		if ($lastpage > $pages) $lastpage = $pages;

		for ($x=$firstpage; $x<=$lastpage; $x++)
			$full_result['other_pages'][] = $x;

	}

	return $full_result;

}

// END ---------------------------------------------------------------------------------- SEARCH FUNCTIONS 


// BEGIN ---------------------------------------------------------------------------------- SPIDER FUNCTIONS 


/*
 * Returns the current Unix time as a float, including the sub-second part.
 */
function getmicrotime(){
    // microtime() without arguments yields the string "<fraction> <seconds>".
    $parts = explode(" ", microtime());
    $fraction = (float)$parts[0];
    $seconds = (float)$parts[1];
    return ($fraction + $seconds);
    }



/*
 * Prints the localized "Powered by" label followed by the Sphider logo link.
 * The label is read from the global $sph_spiderMessages table.
 */
function poweredby () {
	global $sph_spiderMessages;
    //If you want to remove this, please donate to the project at http://www.sphider.eu/donate.php
    $label = $sph_spiderMessages['Powered by'];
    echo $label;?>  <a href="http://www.sphider.eu/"><img src="sphider-logo.png" border="0" style="vertical-align: middle" alt="Sphider"></a>

    <?php 
}


/*
 * Stores one executed search in the query_log table.
 *
 * $query    the search string
 * $elapsed  time the search took
 * $results  number of results; an empty value is stored as 0
 *
 * Any MySQL error text is echoed.
 */
function saveToLog ($query, $elapsed, $results) {
    global $mysql_table_prefix;

    if ($results == "") {
        $results = 0;
    }

    // NOTE(review): the values are interpolated into the statement as-is;
    // confirm that callers escape $query before passing it in.
    $sql = "insert into ".$mysql_table_prefix."query_log (query, time, elapsed, results) values ('$query', now(), '$elapsed', '$results')";
    mysql_query($sql);

    echo mysql_error();
}
	/*
	 * Returns the current Unix time as a float (seconds plus microsecond fraction).
	 * Does the same computation as getmicrotime().
	 */
	function microtime_float(){
	   $pieces = explode(" ", microtime());
	   $micro = (float)$pieces[0];
	   $whole = (float)$pieces[1];
	   return ($micro + $whole);
	}

	
	/*
	 * Fetches one page, queues the links found in it and stores the page in the index.
	 *
	 * $url              address of the page
	 * $level            level written to the temp table for links queued from this page;
	 *                   the page itself is stored with level $level - 1
	 * $site_id          site_id written to the links table
	 * $md5sum           checksum stored for this page earlier, '' when the page is new
	 * $domain           not read by this function
	 * $indexdate        not read by this function
	 * $sessid           id that groups this run's rows in the temp table
	 * $can_leave_domain passed on to url_purify() and get_links()
	 * $reindex          1 when re-indexing; enables check_for_removal() for unusable pages
	 *
	 * Returns nothing. Progress is reported through the print* helpers and MySQL
	 * errors are echoed.
	 */
	function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
		global $entities, $min_delay;
		global $command_line;
		global $min_words_per_page;
		global $supdomain;
		global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;
		$deletable = 0;

		$url_status = url_status($url);
		$thislevel = $level - 1;

		if (strstr($url_status['state'], "Relocation")) {
			// The page moved: queue the redirect target unless it is already queued.
			// str_replace() removes literal spaces exactly as the former eregi_replace() call did.
			$url = str_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));

			if ($url <> '') {
				$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
				echo mysql_error();
				$rows = mysql_num_rows($result);
				if ($rows == 0) {
					mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
					echo mysql_error();
				}
			}

			// NOTE(review): the original code had the statement
			// `$url_status['state'] == "redirected";` here - a comparison with no effect.
			// The state therefore stays "Relocation: ..." and is reported by the else branch
			// below. It was dropped rather than turned into an assignment so the reported
			// status does not change. $url now holds the redirect target, which is also what
			// check_for_removal() receives further down - confirm that this is intended.
		}

		ini_set("user_agent", $user_agent);
		if ($url_status['state'] == 'ok') {
			$OKtoIndex = 1;
			$file_read_error = 0;
			$file = '';

			// Wait until at least $min_delay seconds have passed since the previous fetch.
			if (time() - $delay_time < $min_delay) {
				sleep ($min_delay- (time() - $delay_time));
			}
			$delay_time = time();
			if (!fst_lt_snd(phpversion(), "4.3.0")) {
				$file = file_get_contents($url);
				if ($file === FALSE) {
					$file_read_error = 1;
				}
			} else {
				$fl = @fopen($url, "r");
				if ($fl) {
					while ($buffer = @fgets($fl, 4096)) {
						$file .= $buffer;
					}
					// Close only a handle that was actually opened.
					fclose ($fl);
				} else {
					$file_read_error = 1;
				}
			}
			if ($file_read_error) {
				// Fall back to the raw socket reader; it may return no 'file' entry on failure.
				$contents = getFileContents($url);
				$file = isset($contents['file']) ? $contents['file'] : '';
			}


			$pageSize = number_format(strlen($file)/1024, 2, ".", "");
			printPageSizeReport($pageSize);

			if ($url_status['content'] != 'text') {
				$file = extract_text($file, $url_status['content']);
			}

			printStandardReport('starting', $command_line);


			$newmd5sum = md5($file);


			if ($md5sum == $newmd5sum) {
				printStandardReport('md5notChanged',$command_line);
				$OKtoIndex = 0;
			} else if (isDuplicateMD5($newmd5sum)) {
				$OKtoIndex = 0;
				printStandardReport('duplicate',$command_line);
			}

			if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
				$data = clean_file($file, $url, $url_status['content']);

				if ($data['noindex'] == 1) {
					$OKtoIndex = 0;
					$deletable = 1;
					printStandardReport('metaNoindex',$command_line);
				}


				$wordarray = unique_array(explode(" ", $data['content']));

				if ($data['nofollow'] != 1) {
					$links = get_links($file, $url, $can_leave_domain, $data['base']);
					$links = distinct_array($links);
					$all_links = is_array($links) ? count($links) : 0;
					$numoflinks = 0;
					//if there are any, add to the temp table, but only if there isnt such url already
					if (is_array($links)) {
						// foreach replaces the each() loop (each() no longer exists in PHP 8).
						foreach ($links as $thislink) {
							if (!isset($tmp_urls[$thislink]) || $tmp_urls[$thislink] != 1) {
								$tmp_urls[$thislink] = 1;
								$numoflinks++;
								mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink', '$level', '$sessid')");
								echo mysql_error();
							}
						}
					}
				} else {
					printStandardReport('noFollow',$command_line);
				}

				if ($OKtoIndex == 1) {

					$title = $data['title'];
					$host = $data['host'];
					$path = $data['path'];
					$fulltxt = $data['fulltext'];
					$desc = substr($data['description'], 0,254);
					$url_parts = parse_url($url);
					$domain_for_db = $url_parts['host'];

					// Reuse the cached domain id or create the domain row on first sight.
					if (isset($domain_arr[$domain_for_db])) {
						$dom_id = $domain_arr[$domain_for_db];
					} else {
						mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')");
						$dom_id = mysql_insert_id();
						$domain_arr[$domain_for_db] = $dom_id;
					}

					$wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']);

					//if there are words to index, add the link to the database, get its id, and add the word + their relation
					if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
						if ($md5sum == '') {
							mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)");
							echo mysql_error();
							$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
							echo mysql_error();
							$row = mysql_fetch_row($result);
							$link_id = $row[0];

							save_keywords($wordarray, $link_id, $dom_id);

							printStandardReport('indexed', $command_line);
						}else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating

							$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
							echo mysql_error();
							$row = mysql_fetch_row($result);
							$link_id = $row[0];
							// Keyword relations live in 16 tables, link_keyword0 .. link_keywordf.
							for ($i=0;$i<=15; $i++) {
								$char = dechex($i);
								mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
								echo mysql_error();
							}
							save_keywords($wordarray, $link_id, $dom_id);
							$query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id";
							mysql_query($query);
							echo mysql_error();
							printStandardReport('re-indexed', $command_line);
						}
					}else {
						printStandardReport('minWords', $command_line);

					}
				}
			}
		} else {
			$deletable = 1;
			printUrlStatus($url_status['state'], $command_line);

		}
		if ($reindex ==1 && $deletable == 1) {
			check_for_removal($url); 
		}
		if (!isset($all_links)) {
			$all_links = 0;
		}
		if (!isset($numoflinks)) {
			$numoflinks = 0;
		}
		printLinksReport($numoflinks, $all_links, $command_line);
	}


	/*
	 * Crawls one site level by level and indexes every queued page through index_url().
	 *
	 * $url              start address; a trailing "/" is appended when it has no path
	 * $reindex          1 to re-check pages that are already in the links table
	 * $maxlevel         deepest level to crawl when $soption is 'level'
	 * $soption          'level' (stop after $maxlevel) or 'full' (continue until no links are left)
	 * $url_inc          stored in sites.required and passed to check_include()
	 * $url_not_inc      stored in sites.disallowed and passed to check_include()
	 * $can_leave_domain stored with the site and passed to index_url()
	 *
	 * The crawl queue is kept in the temp table under a session id. Progress is written
	 * to the pending table, and a row found there for the site makes the crawl continue
	 * from the stored position. Returns nothing; MySQL errors are echoed.
	 */
	function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
		global $mysql_table_prefix, $command_line, $mainurl,  $tmp_urls, $domain_arr, $all_keywords;

		// Load all known keywords into the global lookup table (keyword => id).
		$result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
		echo mysql_error();
		while($row=mysql_fetch_array($result)) {
			$all_keywords[addslashes($row[1])] = $row[0];
		}

		$compurl = parse_url($url);
		if (!isset($compurl['path']) || $compurl['path'] == '')
			$url = $url . "/";

		// Session id for this crawl's rows in the temp table.
		$sessid = md5 (microtime().getenv("REMOTE_ADDR"));

		$urlparts = parse_url($url);
		$domain = $urlparts['host'];

		$result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url'");
		echo mysql_error();
		$row = mysql_fetch_row($result);
		$site_id = $row ? $row[0] : '';

		if ($site_id != "" && $reindex == 1) {
			// Re-index: queue the start page and every page already stored for the site.
			mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
			echo mysql_error();
			$result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
			while ($row = mysql_fetch_array($result)) {
				$site_link = $row['url'];
				$link_level = $row['level'];
				if ($site_link != $url) {
					mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', $link_level, '$sessid')");
				}
			}

			$qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
					"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id";
			mysql_query ($qry);
			echo mysql_error();
		} else if ($site_id == '') {
			// Unknown site: create it and read back its id.
			mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
					"values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
			echo mysql_error();
			$result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
			$row = mysql_fetch_row($result);
			$site_id = $row ? $row[0] : '';
		} else {
			mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
					"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
			echo mysql_error();
		}


		$result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
		echo mysql_error();
		$row = mysql_fetch_row($result);
		$pending = $row ? $row[0] : '';
		$level = 0;
		// Running number shown in the reports. It is initialised here, before the pending
		// row is read, so that a resumed crawl keeps the stored value (it used to be reset
		// to 0 after the pending row had been read).
		$num = 0;
		$domain_arr = get_domains();
		if ($pending == '') {
			mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
			echo mysql_error();
		} else {
			// A suspended crawl exists: continue with its session, level and position.
			printStandardReport('continueSuspended',$command_line);
			$sessid = $row[1];
			$level = $row[2];
			$pend_count = $row[3] + 1;
			$num = (int)$row[4];
			$pending = 1;
			$tmp_urls = get_temp_urls($sessid);
		}

		if ($reindex != 1) {
			mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
			echo mysql_error();
		}


		$omit = check_robot_txt($url);
		// check_robot_txt() can return null; normalise so the loops below are safe.
		if (!is_array($omit)) {
			$omit = array();
		}

		printHeader ($omit, $url, $command_line);


		$mainurl = $url;

		while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
			// When resuming, start inside the level at the stored position (first pass only).
			if ($pending == 1) {
				$count = $pend_count;
				$pending = 0;
			} else
				$count = 0;

			$links = array();

			$result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
			echo mysql_error();
			$rows = mysql_num_rows($result);

			if ($rows == 0) {
				break;
			}

			while ($row = mysql_fetch_array($result)) {
				$links[] = $row['link'];
			}


			while ($count < count($links)) {
				$num++;
				$thislink = $links[$count];
				$urlparts = parse_url($thislink);
				// host[:port] of the link, used to anchor path-only robots.txt rules.
				$host_prefix = isset($urlparts['host']) ? $urlparts['host'] : '';
				if (isset($urlparts['port'])) {
					$host_prefix .= ':'.$urlparts['port'];
				}
				$forbidden = 0;
				foreach ($omit as $omiturl) {
					$omiturl = trim($omiturl);

					$omiturl_parts = parse_url($omiturl);
					// A rule without a scheme is a path: it has to follow the host directly.
					// (The earlier test read a misspelled variable, so this branch never ran.)
					if (!isset($omiturl_parts['scheme']) || $omiturl_parts['scheme'] == '') {
						$check_omit = $host_prefix . $omiturl;
					} else {
						$check_omit = $omiturl;
					}

					// !== false so that a match at offset 0 also counts.
					if ($check_omit != '' && strpos($thislink, $check_omit) !== false) {
						printRobotsReport($num, $thislink, $command_line);
						check_for_removal($thislink); 
						$forbidden = 1;
						break;
					}
				}

				if (!check_include($thislink, $url_inc, $url_not_inc )) {
					printUrlStringReport($num, $thislink, $command_line);
					check_for_removal($thislink); 
					$forbidden = 1;
				} 

				if ($forbidden == 0) {
					printRetrieving($num, $thislink, $command_line);
					$query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'";
					$result = mysql_query($query);
					echo mysql_error();
					$rows = mysql_num_rows($result);
					if ($rows == 0) {
						// New page.
						index_url($thislink, $level+1, $site_id, '',  $domain, '', $sessid, $can_leave_domain, $reindex);

						mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
						echo mysql_error();
					}else if ($rows <> 0 && $reindex == 1) {
						// Known page, re-index requested: pass the stored checksum and date.
						$row = mysql_fetch_array($result);
						$md5sum = $row['md5sum'];
						$indexdate = $row['indexdate'];
						index_url($thislink, $level+1, $site_id, $md5sum,  $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
						mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
						echo mysql_error();
					}else {
						printStandardReport('inDatabase',$command_line);
					}

				}
				$count++;
			}
			$level++;
		}

		// Crawl finished: drop the queue and the resume marker.
		mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
		echo mysql_error();
		mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
		echo mysql_error();
		printStandardReport('completed',$command_line);


	}

	/*
	 * Re-indexes every site stored in the sites table by calling index_site()
	 * with the settings saved for that site. A spider_depth of -1 selects the
	 * 'full' crawl option, any other value the 'level' option.
	 */
	function index_all() {
		global $mysql_table_prefix;
		$sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
		echo mysql_error();
		while ($site = mysql_fetch_row($sites)) {
			list($url, $depth, $include, $not_include, $can_leave_domain) = $site;
			// An empty can_leave_domain column is treated as 0.
			if ($can_leave_domain=='') {
				$can_leave_domain=0;
			}
			$soption = ($depth == -1) ? 'full' : 'level';
			index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain);
		}
	}			

	/*
	 * Returns the links queued in the temp table for the given session id
	 * as a lookup array (link => 1). MySQL errors are echoed.
	 */
	function get_temp_urls ($sessid) {
		global $mysql_table_prefix;
		$rows = mysql_query("select link from ".$mysql_table_prefix."temp where id='$sessid'");
		echo mysql_error();
		$queued = Array();
		for ($row = mysql_fetch_row($rows); $row; $row = mysql_fetch_row($rows)) {
			$link = $row[0];
			$queued[$link] = 1;
		}
		return $queued;

	}

	/*
	 * Returns all rows of the domains table as a lookup array
	 * (domain name => domain_id). MySQL errors are echoed.
	 */
	function get_domains () {
		global $mysql_table_prefix;
		$rows = mysql_query("select domain_id, domain from ".$mysql_table_prefix."domains");
		echo mysql_error();
		$by_name = Array();
		for ($row = mysql_fetch_row($rows); $row; $row = mysql_fetch_row($rows)) {
			$id = $row[0];
			$name = $row[1];
			$by_name[$name] = $id;
		}
		return $by_name;

	}

	/*
	 * Prints the usage text for the command line spider.
	 */
	function commandline_help() {
		$help = array(
			"Usage: php spider.php <options>\n\n",
			"Options:\n",
			" -all\t\t Reindex everything in the database\n",
			" -u <url>\t Set url to index\n",
			" -f\t\t Set indexing depth to full (unlimited depth)\n",
			" -d <num>\t Set indexing depth to <num>\n",
			" -l\t\t Allow spider to leave the initial domain\n",
			" -r\t\t Set spider to reindex a site\n",
			" -m <string>\t Set the string(s) that an url must include (use \\n as a delimiter between multiple strings)\n",
			" -n <string>\t Set the string(s) that an url must not include (use \\n as a delimiter between multiple strings)\n"
		);
		foreach ($help as $line) {
			print $line;
		}
	}
	// The statements below are not inside any function: they run when this file is loaded.
	// NOTE(review): $all, $url, $log_file and $admin_email are not assigned anywhere in
	// this part of the file; presumably they come from the scope that loads it - confirm.
	global $command_line, $email_log, $log_handle;
	// Emit the closing "quit" message.
	printStandardReport('quit',$command_line);
	if ($email_log) {
		// Send a short indexing report by e-mail, naming either 'ALL' or the indexed url.
		$indexed = ($all==1) ? 'ALL' : $url;
		$log_report = "";
		if ($log_handle) {
			$log_report = "Log saved into $log_file";
		}
		mail($admin_email, "Sphider indexing report", "Sphider has finished indexing $indexed at ".date("y-m-d H:i:s").". ".$log_report);
	}
	// Close the log file if one is open.
	if ( $log_handle) {
		fclose($log_handle);
	}

// END ---------------------------------------------------------------------------------- SPIDER FUNCTIONS 

global $min_words_per_page;

// Report texts used by printStandardReport(), keyed by message type.
// Index 0 of each entry is the HTML text, index 1 the plain-text one.
// "%cur_time" is replaced with the current time when a message is printed.
$spiderMessages = array();

$spiderMessages["noFollow"] = array(
	0 => " <font color=red><b> No-follow flag set</b></font>. ",
	1 => " No-follow flag set."
);
$spiderMessages["inDatabase"] = array(
	0 => " <font color=red><b> already in database</b></font><br>",
	1 => " already in database\n"
);
$spiderMessages["completed"] = array(
	0 => "<br>Completed at %cur_time.\n<br>",
	1 => "Completed at %cur_time.\n"
);
$spiderMessages["starting"] = array(
	0 => " Starting indexing at %cur_time.\n",
	1 => " Starting indexing at %cur_time.\n"
);
$spiderMessages["quit"] = array(
	0 => "</body></html>",
	1 => ""
);
$spiderMessages["pageRemoved"] = array(
	0 => " <font color=red>Page removed from index.</font><br>\n",
	1 => " Page removed from index.\n"
);
$spiderMessages["continueSuspended"] = array(
	0 => "<br>Continuing suspended indexing.<br>\n",
	1 => "Continuing suspended indexing.\n"
);
$spiderMessages["indexed"] = array(
	0 => "<br><b> <font color=\"green\">Indexed</font></b><br>\n",
	1 => " \nIndexed\n"
);
$spiderMessages["duplicate"] = array(
	0 => " <font color=\"red\"><b>Page is a duplicate.</b></font><br>\n",
	1 => " Page is a duplicate.\n"
);
$spiderMessages["md5notChanged"] = array(
	0 => " <font color=\"red\"><b>MD5 sum checked. Page content not changed</b></font><br>\n",
	1 => " MD5 sum checked. Page content not changed.\n"
);
$spiderMessages["metaNoindex"] = array(
	0 => " <font color=\"red\">No-Index flag set in meta tags.</font><br>\n",
	1 => " No-Index flag set in meta tags.\n"
);
$spiderMessages["re-indexed"] = array(
	0 => " <font color=\"green\">Re-indexed</font><br>\n",
	1 => " Re-indexed\n"
);
// The word limit is interpolated once, when this table is built.
$spiderMessages["minWords"] = array(
	0 => " <font color=\"red\">Page contains less than $min_words_per_page words</font><br>\n",
	1 => " Page contains less than $min_words_per_page words.\n"
);

// Publish the table as a global explicitly.
$GLOBALS['spiderMessages']  =$spiderMessages;

/*
 * Reports that a link was skipped because robots.txt forbids it.
 *
 * $num       running number of the link
 * $thislink  the skipped link
 * $cl        0 prints the HTML text, any other value the plain text
 *
 * Printing happens only when $print_results is set; the log always receives
 * the message, in HTML when $log_format is "html", otherwise as plain text.
 */
function printRobotsReport($num, $thislink, $cl) {
	global $print_results, $log_format;
	$plain = "$num. Link $thislink: file checking forbidden in robots.txt file.\n";
	$html = "<b>$num</b>. Link <b>$thislink</b>: <font color=red>file checking forbidden in robots.txt file</font></br>";

	if ($print_results) {
		print ($cl==0) ? $html : $plain;
		flush();
	}

	writeToLog(($log_format=="html") ? $html : $plain);
}

/*
 * Reports that a link was skipped because of the required/disallowed string rules.
 *
 * $num       running number of the link
 * $thislink  the skipped link
 * $cl        0 prints the HTML text, any other value the plain text
 *
 * Printing happens only when $print_results is set; the log always receives
 * the message, in HTML when $log_format is "html", otherwise as plain text.
 */
function printUrlStringReport($num, $thislink, $cl) {
	global $print_results, $log_format;
	$plain = "$num. Link $thislink: file checking forbidden  by required/disallowed string rule.\n";
	$html = "<b>$num</b>. Link <b>$thislink</b>: <font color=red>file checking forbidden by required/disallowed string rule</font></br>";

	if ($print_results) {
		print ($cl==0) ? $html : $plain;
		flush();
	}

	writeToLog(($log_format=="html") ? $html : $plain);
}

/*
 * Reports that a link is about to be fetched, together with the current time.
 *
 * $num       running number of the link
 * $thislink  the link being retrieved
 * $cl        0 prints the HTML text, any other value the plain text
 *
 * Printing happens only when $print_results is set; the log always receives
 * the message, in HTML when $log_format is "html", otherwise as plain text.
 */
function printRetrieving($num, $thislink, $cl) {
	global $print_results, $log_format;
	$plain = "$num. Retrieving: $thislink at " . date("H:i:s").".\n";
	$html = "<b>$num</b>. Retrieving: <b>$thislink</b> at " . date("H:i:s").".<br>\n";

	if ($print_results) {
		print ($cl==0) ? $html : $plain;
		flush();
	}

	writeToLog(($log_format=="html") ? $html : $plain);
}


/*
 * Reports how many links a page contained and how many of them were new.
 *
 * $numoflinks  number of new links
 * $all_links   number of all links found
 * $cl          0 prints the HTML text, any other value the plain text
 *
 * Printing happens only when $print_results is set; the log always receives
 * the message, in HTML when $log_format is "html", otherwise as plain text.
 */
function printLinksReport($numoflinks, $all_links, $cl) {
	global $print_results, $log_format;
	$plain = " Legit links found: $all_links. New links found: $numoflinks\n";
	$html = " Links found: <font color=\"blue\"><b>$all_links</b></font>. New links: <font color=\"blue\"><b>$numoflinks</b></font><br>\n";

	if ($print_results) {
		print ($cl==0) ? $html : $plain;
		flush();
	}

	writeToLog(($log_format=="html") ? $html : $plain);
}

/*
 * Prints and logs the header of a crawl: the start url and the robots.txt exclusions.
 *
 * $omit  list of disallowed paths from robots.txt; anything that is not an array
 *        (for example null) is treated as "no exclusions"
 * $url   the url being spidered; its scheme and host prefix every listed path
 * $cl    0 prints the HTML page header, any other value the plain text
 *
 * Printing happens only when $print_results is set. The log receives the HTML text
 * (without the "Back to admin" link) when $log_format is "html", otherwise the plain text.
 */
function printHeader($omit, $url, $cl) {
	global $print_results, $log_format;

	// Build absolute urls for the exclusions; stays empty when there is nothing to list.
	$omits = array();
	if (is_array($omit) && count($omit) > 0) {
		$urlparts = parse_url($url);
		foreach ($omit as $dir) {			
			$omits[] = $urlparts['scheme']."://".$urlparts['host'].$dir;
		}
	}
	$has_omits = count($omits) > 0;
	
	$log_msg_txt = "Spidering $url\n";
	if ($has_omits) {
		$log_msg_txt .= "Disallowed files and directories in robots.txt:\n";
		$log_msg_txt .= implode("\n", $omits);
		$log_msg_txt .= "\n\n";
	}

	$log_msg_html_1 = "<html><head><LINK REL=STYLESHEET HREF=\"admin.css\" TYPE=\"text/css\"></head>\n";
	$log_msg_html_1 .= "<body style=\"font-family:Verdana, Arial; font-size:12px\">";
	
	$log_msg_html_link = "[Back to <a href=\"admin.php\">admin</a>]";
	$log_msg_html_2 = "<p><font size=\"+1\">Spidering <b>$url</b></font></p>\n";

	if ($has_omits) {
		$log_msg_html_2 .=  "Disallowed files and directories in robots.txt:<br>\n";
		$log_msg_html_2 .=  implode("<br>", $omits);
		$log_msg_html_2 .=  "<br><br>";
	}

	if ($print_results) {
		if ($cl==0) {
			print $log_msg_html_1.$log_msg_html_link.$log_msg_html_2;
		} else {
			print $log_msg_txt;
		}
		flush();
	}

	if ($log_format=="html") {
		writeToLog($log_msg_html_1.$log_msg_html_2);
	} else {
		writeToLog($log_msg_txt);
	}
}

/*
 * Reports the size of the fetched page.
 *
 * $pageSize  size value placed in front of "kb"
 *
 * The same plain text is printed (when $print_results is set) and written to the log.
 */
function printPageSizeReport($pageSize) {
	global $print_results, $log_format;
	$message = "Size of page: $pageSize"."kb. ";

	if ($print_results) {
		echo $message;
		flush();
	}

	writeToLog($message);
}

/*
 * Reports the status text of a url that could not be indexed.
 *
 * $report  status text to show
 * $cl      0 prints the HTML text, any other value the plain text
 *
 * Printing happens only when $print_results is set; the log always receives
 * the message, in HTML when $log_format is "html", otherwise as plain text.
 */
function printUrlStatus($report, $cl) {
	global $print_results, $log_format;
	$plain = "$report\n";
	$html = " <font color=red><b>$report</b></font><br>\n";

	if ($print_results) {
		print ($cl==0) ? $html : $plain;
		flush();
	}

	writeToLog(($log_format=="html") ? $html : $plain);
}



/*
 * Reports a failed socket connection.
 *
 * $errmsg  error text appended to the fixed message
 *
 * The same plain text is printed (when $print_results is set) and written to the log.
 */
function printConnectErrorReport($errmsg) {
	global $print_results, $log_format;
	$message = "Establishing connection with socket failed. " . $errmsg;

	if ($print_results) {
		echo $message;
		flush();
	}

	writeToLog($message);
}



/*
 * Appends a message to the log file when logging is enabled.
 *
 * $msg  text to write
 *
 * Does nothing unless $keep_log is set. Terminates the script with die()
 * when there is no open $log_handle or when the write fails.
 */
function writeToLog($msg) {
	global $keep_log, $log_handle;

	if (!$keep_log) {
		return;
	}

	if (!$log_handle) {
		die ("Cannot open file for logging. ");
	}

	$written = fwrite($log_handle, $msg);
	if ($written === FALSE) {
		die ("Cannot write to file for logging. ");
	}
}


/*
 * Prints and logs one of the predefined messages from $spiderMessages.
 *
 * $type  key of the message in $spiderMessages
 * $cl    index of the variant to print (0 = HTML text, 1 = plain text)
 *
 * "%cur_time" in the message is replaced with the current time. Printing happens
 * only when $print_results is set; the log receives variant 0 when $log_format
 * is "html" and variant 1 otherwise.
 */
function printStandardReport($type, $cl) {
	global $print_results, $log_format, $spiderMessages;

	if ($print_results) {
		$shown = str_replace('%cur_time', date("H:i:s"), $spiderMessages[$type][$cl]);
		print $shown;
		flush();
	}

	$variant = ($log_format=="html") ? 0 : 1;
	$logged = str_replace('%cur_time', date("H:i:s"), $spiderMessages[$type][$variant]);
	writeToLog($logged);

}
/*
 * Downloads a url over a raw socket with an HTTP/1.0 GET request.
 *
 * $url  absolute http or https url
 *
 * Returns an array with
 *   'state'  "ok", "timeout", "NOHOST" (connection failed) or "Cannot send request"
 *   'file'   the response body (everything after the header block); only set when
 *            the request was sent
 */
function getFileContents($url) {
	global $user_agent;
	$contents = array();
	$urlparts = parse_url($url);
	$host = $urlparts['host'];
	// A url without a path is requested as "/".
	$path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
	if (isset($urlparts['query']) && $urlparts['query'] != "")
		$path .= "?".$urlparts['query'];

	// Explicit port wins; otherwise 443 for https and 80 for everything else.
	if (isset ($urlparts['port'])) {
		$port = (int) $urlparts['port'];
	} else if (isset($urlparts['scheme']) && $urlparts['scheme'] == "https") {
		$port = 443;
	} else {
		$port = 80;
	}

	if ($port == 80) {
		$portq = "";
	} else {
		$portq = ":$port";
	}

	$all = "*/*";

	$request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";

	$fsocket_timeout = 30;
	if (substr($url, 0, 5) == "https") {
		$target = "ssl://".$host;
	} else {
		$target = $host;
	}


	$errno = 0;
	$errstr = "";
	$fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);

	if (!$fp) {
		// The error text is reported through printConnectErrorReport() only
		// (leftover debug prints were removed).
		$contents['state'] = "NOHOST";
		printConnectErrorReport($errstr);
		return $contents;
	}

	if (!fputs($fp, $request)) {
		// Do not leak the socket when the request cannot be sent.
		fclose($fp);
		$contents['state'] = "Cannot send request";
		return $contents;
	}

	$data = "";
	socket_set_timeout($fp, $fsocket_timeout);
	$status = socket_get_status($fp);
	while (!feof($fp) && !$status['timed_out']) {
		$chunk = fgets($fp, 8192);
		// Refresh the status on every pass so a timeout is actually noticed.
		$status = socket_get_status($fp);
		if ($chunk === false) {
			// Nothing could be read (timeout, end of stream or error): stop reading.
			break;
		}
		$data .= $chunk;
	}
	fclose($fp);

	if ($status['timed_out'] == 1) {
		$contents['state'] = "timeout";
	} else {
		$contents['state'] = "ok";
	}

	// The body starts after the first empty line; without one there is no body.
	$header_end = strpos($data, "\r\n\r\n");
	if ($header_end === false) {
		$contents['file'] = "";
	} else {
		$contents['file'] = substr($data, $header_end + 4);
	}
	return $contents;
}

/*
 * Checks with a HEAD request whether a url is reachable and has an indexable type.
 *
 * $url  absolute http or https url
 *
 * Returns an array with
 *   'state'    'ok', "NOHOST", "Unreachable: http <code>", "Relocation: http <code>",
 *              "Not text or html" or "Timed out (no reply from server)"
 *   'content'  'text', 'pdf', 'doc', 'xls' or 'ppt' (only when state is 'ok')
 *   'path'     redirect target (only for a Relocation)
 *   'date'     value of the Last-Modified header, when one was seen
 *
 * A 3xx answer other than 302 that carries a Location header is a Relocation.
 * pdf/doc/xls/ppt types count as 'ok' only when the matching $index_* setting is 1.
 */
function url_status($url) {
	global $user_agent, $index_pdf, $index_doc, $index_xls, $index_ppt;
	$status = array();
	$urlparts = parse_url($url);
	$host = $urlparts['host'];
	// A url without a path is requested as "/".
	$path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
	if (isset($urlparts['query']))
		$path .= "?".$urlparts['query'];

	// Explicit port wins; otherwise 443 for https and 80 for everything else.
	if (isset ($urlparts['port'])) {
		$port = (int) $urlparts['port'];
	} else if (isset($urlparts['scheme']) && $urlparts['scheme'] == "https") {
		$port = 443;
	} else {
		$port = 80;
	}

	if ($port == 80) {
		$portq = "";
	} else {
		$portq = ":$port";
	}

	$all = "*/*"; //just to prevent "comment effect" in get accept
	$request = "HEAD $path HTTP/1.1\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";

	if (substr($url, 0, 5) == "https") {
		$target = "ssl://".$host;
	} else {
		$target = $host;
	}

	$fsocket_timeout = 30;
	$errno = 0;
	$errstr = "";
	$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
	print $errstr;
	if (!$fp) {
		// Nothing was opened, so there is nothing to close.
		$status['state'] = "NOHOST";
		return $status;
	}

	socket_set_timeout($fp, $fsocket_timeout);
	fputs($fp, $request);
	$answer = fgets($fp, 4096);

	$regs = Array ();
	$linkstate = "ok";
	$httpcode = 0;
	$full_httpcode = 0;
	// Status line: $regs[1] is the full code, $regs[2] its first digit.
	if (preg_match("#HTTP/[0-9.]+ (([0-9])[0-9]{2})#", (string) $answer, $regs)) {
		$httpcode = $regs[2];
		$full_httpcode = $regs[1];

		if ($httpcode <> 2 && $httpcode <> 3) {
			$status['state'] = "Unreachable: http $full_httpcode";
			$linkstate = "Unreachable";
		}
	}

	if ($linkstate <> "Unreachable") {
		$content = '';
		while ($answer) {
			$answer = fgets($fp, 4096);
			// Stop at the end of the header block (or when nothing more can be read)
			// instead of waiting for the server to close the connection.
			if ($answer === false || trim($answer) == '') {
				break;
			}

			if (preg_match("/^Location: *([^\n\r ]+)/i", $answer, $regs) && $httpcode == 3 && $full_httpcode != 302) {
				$status['path'] = $regs[1];
				$status['state'] = "Relocation: http $full_httpcode";
				fclose($fp);
				return $status;
			}

			if (preg_match("/^Last-Modified: *([a-z0-9,: ]+)/i", $answer, $regs)) {
				$status['date'] = $regs[1];
			}

			if (preg_match("/^Content-Type:/i", $answer)) {
				$content = $answer;
				break;
			}
		}
		$socket_status = socket_get_status($fp);
		if (preg_match("#Content-Type: *([a-z/.-]*)#i", $content, $regs)) {
			// Header values are matched case-insensitively, so compare in lower case.
			$type = strtolower($regs[1]);
			if ($type == 'text/html' || $type == 'text/' || $type == 'text/plain') {
				$status['content'] = 'text';
				$status['state'] = 'ok';
			} else if ($type == 'application/pdf' && $index_pdf == 1) {
				$status['content'] = 'pdf';
				$status['state'] = 'ok';
			} else if (($type == 'application/msword' || $type == 'application/vnd.ms-word') && $index_doc == 1) {
				$status['content'] = 'doc';
				$status['state'] = 'ok';
			} else if (($type == 'application/excel' || $type == 'application/vnd.ms-excel') && $index_xls == 1) {
				$status['content'] = 'xls';
				$status['state'] = 'ok';
			} else if (($type == 'application/mspowerpoint' || $type == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
				$status['content'] = 'ppt';
				$status['state'] = 'ok';
			} else {
				$status['state'] = "Not text or html";
			}
		} else if ($socket_status['timed_out'] == 1) {
			$status['state'] = "Timed out (no reply from server)";
		} else {
			$status['state'] = "Not text or html";
		}
	}

	fclose($fp);
	return $status;
}

/*
 * Reads robots.txt of the server that hosts $url and collects the disallowed paths.
 *
 * $url  any url on the site; robots.txt is requested with the same scheme, host
 *       and port (scheme defaults to http)
 *
 * Returns an array of Disallow values that apply to "*" or to $user_agent.
 * The array is empty when robots.txt is not available, contains no matching rule,
 * or contains an empty Disallow line for a matching agent.
 */
function check_robot_txt($url) {
	global $user_agent;
	$urlparts = parse_url($url);
	$scheme = (isset($urlparts['scheme']) && $urlparts['scheme'] != '') ? $urlparts['scheme'] : 'http';
	$port = isset($urlparts['port']) ? ':'.$urlparts['port'] : '';
	$url = $scheme.'://'.$urlparts['host'].$port."/robots.txt";

	$url_status = url_status($url);
	$omit = array ();

	if ($url_status['state'] == "ok") {
		$robot = file($url);
		if (!$robot) {
			// Fall back to the raw socket reader when file() returns nothing.
			$contents = getFileContents($url);
			$file = isset($contents['file']) ? $contents['file'] : '';
			$robot = explode("\n", $file);
		}

		$regs = Array ();
		$this_agent= "";
		// Rules count only after a User-agent line that matches; none has been seen yet.
		$check = 0;
		foreach ($robot as $line) {
			if (preg_match("/^user-agent: *([^#]+) */i", $line, $regs)) {
				$this_agent = trim($regs[1]);
				if ($this_agent == '*' || $this_agent == $user_agent)
					$check = 1;
				else
					$check = 0;
			}

			if ($check == 1 && preg_match("/disallow: *([^#]+)/i", $line, $regs)) {
				// Strip blanks and line-end characters from the rule.
				$disallow_str = preg_replace("/[\r\n ]+/", "", $regs[1]);
				if (trim($disallow_str) != "") {
					$omit[] = $disallow_str;
				} else {
					// An empty Disallow for a matching agent stops the scan with no
					// exclusions (an empty list rather than null).
					return array();
				}
			}
		}
	}

	return $omit;
}

/*
 * Removes the file part from a url, leaving scheme, host, optional port and the
 * directory path (used to build a url from a url and a given relative path).
 *
 * $url  absolute url
 *
 * Returns the url up to and including the last "/" of its path. Query string and
 * fragment are dropped, and a port of 80 is omitted.
 */
function remove_file_from_url($url) {
	$url_parts = parse_url($url);
	$path = isset($url_parts['path']) ? $url_parts['path'] : '';

	// Cut everything after the last slash. The file name is never used as a pattern,
	// so names containing regex characters (e.g. "c++.html") are handled correctly.
	$path = preg_replace('/[^\/]+$/', '', $path);

	if (!isset($url_parts['port']) || $url_parts['port'] == 80 || $url_parts['port'] == "") {
		$portq = "";
	} else {
		$portq = ":".$url_parts['port'];
	}

	$url = $url_parts['scheme']."://".$url_parts['host'].$portq.$path;
	return $url;
}

/*
 * Extracts links from html.
 *
 * $file              page source
 * $url               address of the page; relative links are resolved against it
 * $can_leave_domain  passed on to url_purify()
 * $base              base url declared by the page; replaces $url when not empty
 *
 * Links are taken from href attributes (skipped when followed by rel=nofollow),
 * frame/iframe src attributes, window.location assignments, meta refresh tags and
 * window.open calls. Each candidate goes through url_purify(); candidates for
 * which it returns '' are dropped. Every distinct candidate is processed once.
 *
 * Returns a list of purified urls.
 */
function get_links($file, $url, $can_leave_domain, $base) {

    // The base URL comes from either the meta tag or the current URL.
    if (!empty($base)) {
        $url = $base;
    }

	$links = array ();
	$regs = Array ();
	$checked_urls = Array();

	// href attributes: group 1 is the url, group 4 is set when rel=nofollow follows.
	preg_match_all("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", $file, $regs, PREG_SET_ORDER);
	foreach ($regs as $val) {
		if (!isset($checked_urls[$val[1]]) && !isset ($val[4])) { //if nofollow is not set
			if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
				$links[] = $a;
			}
			$checked_urls[$val[1]] = 1;
		}
	}

	// In the remaining patterns group 1 is only the prefix ("frame ... src",
	// "window.location", ...); the url itself is group 2. The former loops read
	// group 1 and tested group 4, which is always set there, so no link was ever
	// collected from these sources.
	$other_patterns = array(
		"/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
		"/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
		"/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
		"/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i"
	);
	foreach ($other_patterns as $pattern) {
		preg_match_all($pattern, $file, $regs, PREG_SET_ORDER);
		foreach ($regs as $val) {
			$candidate = isset($val[2]) ? $val[2] : '';
			// Skip empty matches and urls that were handled already.
			if ($candidate == '' || isset($checked_urls[$candidate])) {
				continue;
			}
			$checked_urls[$candidate] = 1;
			if (($a = url_purify($candidate, $url, $can_leave_domain)) != '') {
				$links[] = $a;
			}
		}
	}

	return $links;
}

/*
 * Builds a unique word array from the text of a webpage, together with the count
 * of each word.
 *
 * $arr  list of words
 *
 * Words are stemmed first when $stem_words is 1, then sorted. A word is kept when
 * it is at least $min_word_length long, contains a letter (or a digit when
 * $index_numbers is 1) after remove_accents(), and is not flagged in $common.
 * A leading or trailing "-" or "\'" is stripped from kept words. Counts stop
 * growing once they reach $word_upper_bound.
 *
 * Returns a list of entries in sorted order; in each entry index 1 is the word
 * and index 2 its count.
 */
function unique_array($arr) {
	global $min_word_length;
	global $common;
	global $word_upper_bound;
	global $index_numbers, $stem_words;
	
	if ($stem_words == 1) {
		$stemmed = Array();
		foreach ($arr as $val) {
			$stemmed[] = stem($val);
		}
		$arr = $stemmed;
	}
	// sort() also reindexes the array to 0..n-1, which the walk below relies on.
	sort($arr);

	if ($index_numbers == 1) {
		$pattern = "/[a-z0-9]+/";
	} else {
		$pattern = "/[a-z]+/";
	}

	$newarr = array ();
	$regs = Array ();
	$total = count($arr);
	$i = 0;
	$pos = 0;

	while ($pos < $total) {
		// Count the run of identical neighbours. The counter restarts at 1 for every
		// word, so the count of a skipped word can no longer leak into the next kept word.
		$element = $arr[$pos];
		$counter = 1;
		$pos ++;
		while ($pos < $total && $arr[$pos] === $element) {
			if ($counter < $word_upper_bound)
				$counter ++;
			$pos ++;
		}

		//check if word is long enough, contains alphabetic characters and is not a common word
		$is_common = isset($common[$element]) && $common[$element] == 1;
		if (strlen($element) >= $min_word_length && preg_match($pattern, remove_accents($element)) && !$is_common) {
			if (preg_match("/^(-|\\\')(.*)/", $element, $regs))
				$element = $regs[2];

			if (preg_match("/(.*)(\\\'|-)$/", $element, $regs))
				$element = $regs[1];

			$newarr[$i][1] = $element;
			$newarr[$i][2] = $counter;
			$i ++;
		}
	}
	return $newarr;
}

/**
 * Checks whether a link may be followed and returns it as an absolute url.
 *
 * A link is rejected (empty string returned) when
 *   - it names a host other than the host of $mainurl and $can_leave_domain is not 1,
 *   - it ends in one of the extensions listed in $ext, or in a backslash
 *     (both checks only run when $ext is an array),
 *   - its query string is flagged in $apache_indexes,
 *   - it is a mailto:, javascript: or news: link, or uses a scheme other than http/https,
 *   - $can_leave_domain is not 1 and the final url does not contain $mainurl
 *     (after remove_file_from_url() has been applied to $mainurl).
 *
 * Relative links are resolved against $parent_url (with its file name removed
 * by remove_file_from_url()), "dir/../" segments are collapsed and repeated
 * slashes are merged. The port of the resulting url is the port of the link
 * itself (a root-relative link inherits the port of $parent_url); port 80 is
 * omitted.
 *
 * NOTE(review): $mainurl is a global and is overwritten here with its
 * file-less form, exactly as before; callers appear to rely on nothing else,
 * but confirm before making it local.
 *
 * @param string $url              link as found in the page
 * @param string $parent_url       url of the page the link was found on
 * @param mixed  $can_leave_domain 1 when links to other hosts may be followed
 * @return string the absolute url, or '' when the link must not be followed
 */
function url_purify($url, $parent_url, $can_leave_domain) {
	global $ext, $mainurl, $apache_indexes, $strip_sessids;

	// parse_url() returns false for seriously malformed urls; treat that as "no parts"
	$urlparts = parse_url($url);
	if (!is_array($urlparts)) {
		$urlparts = array();
	}
	$main_url_parts = parse_url((string) $mainurl);
	if (!is_array($main_url_parts)) {
		$main_url_parts = array();
	}

	$link_host = isset($urlparts['host']) ? $urlparts['host'] : '';
	$main_host = isset($main_url_parts['host']) ? $main_url_parts['host'] : '';
	if ($link_host != "" && $link_host != $main_host && $can_leave_domain != 1) {
		return '';
	}

	if (is_array($ext)) {
		foreach ($ext as $excl) {
			if (preg_match("/\.$excl$/i", $url)) {
				return '';
			}
		}
		if (substr($url, -1) == '\\') {
			return '';
		}
	}

	if (isset($urlparts['query']) && !empty($apache_indexes[$urlparts['query']])) {
		return '';
	}

	if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
		return '';
	}

	//only http and https links are followed
	$scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";
	if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
		return '';
	}

	//parent url might be used to build an url from relative path
	$parent_url = remove_file_from_url($parent_url);
	$parent_url_parts = parse_url($parent_url);
	if (!is_array($parent_url_parts)) {
		$parent_url_parts = array();
	}

	if (substr($url, 0, 1) == '/') {
		$parent_scheme = isset($parent_url_parts['scheme']) ? $parent_url_parts['scheme'] : '';
		$parent_host = isset($parent_url_parts['host']) ? $parent_url_parts['host'] : '';
		// keep the port of the parent page, otherwise root-relative links on a
		// non-standard port would point at the wrong server
		$parent_port = isset($parent_url_parts['port']) ? ":".$parent_url_parts['port'] : '';
		$url = $parent_scheme."://".$parent_host.$parent_port.$url;
	} else if (!isset($urlparts['scheme'])) {
		$url = $parent_url.$url;
	}

	$url_parts = parse_url($url);
	if (!is_array($url_parts)) {
		$url_parts = array();
	}

	$urlpath = isset($url_parts['path']) ? $url_parts['path'] : '';

	//remove relative path instructions like ../ etc
	$regs = array();
	while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
		$urlpath = str_replace($regs[0], "", $urlpath);
	}
	$urlpath = preg_replace("/\/+/", "/", $urlpath);
	$urlpath = preg_replace("/[^\/]*\/[.]{2}/", "",  $urlpath);
	$urlpath = str_replace("./", "", $urlpath);

	$query = "";
	if (isset($url_parts['query'])) {
		$query = "?".$url_parts['query'];
	}

	// use the port of the link itself, not the port of the main url
	if (!isset($url_parts['port']) || $url_parts['port'] == "" || $url_parts['port'] == 80) {
		$portq = "";
	} else {
		$portq = ":".$url_parts['port'];
	}

	$final_scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : '';
	$final_host = isset($url_parts['host']) ? $url_parts['host'] : '';
	$url = $final_scheme."://".$final_host.$portq.$urlpath.$query;

	//if we index sub-domains
	if ($can_leave_domain == 1) {
		return $url;
	}

	$mainurl = remove_file_from_url($mainurl);

	if ($strip_sessids == 1) {
		$url = remove_sessid($url);
	}

	//only urls staying in the starting domain/directory are followed
	$url = convert_url($url);
	if (strpos($url, $mainurl) === false) {
		return '';
	}
	return $url;
}

/**
 * Stores the keywords of one page and links them to the page.
 *
 * Every word of at most 30 bytes is looked up in the $all_keywords cache
 * (word => keyword id); unknown words are inserted into the keywords table,
 * and when the insert fails with MySQL error 1062 (duplicate key) the existing
 * id is selected instead. The (link, keyword, weight, domain) rows are then
 * written in one multi-row insert per link_keyword0 .. link_keywordf table,
 * where the table is chosen by the first hex digit of md5(word).
 *
 * NOTE(review): $word is placed into the SQL text unescaped; presumably it was
 * already escaped upstream (the page content is run through addslashes) -
 * confirm before feeding this function from another source.
 *
 * @param array $wordarray list of array(1 => word, 2 => weight)
 * @param mixed $link_id   id of the page in the links table
 * @param mixed $domain    domain id stored with every row
 * @return void
 */
function save_keywords($wordarray, $link_id, $domain) {
	global $mysql_table_prefix, $all_keywords;

	// value lists per target table, keyed by hex digit
	$inserts = array();

	foreach ($wordarray as $entry) {
		$word = $entry[1];
		$weight = $entry[2];
		if (strlen($word) > 30) {
			continue;
		}

		$wordmd5 = substr(md5($word), 0, 1);
		$keyword_id = isset($all_keywords[$word]) ? $all_keywords[$word] : "";
		if ($keyword_id == "") {
			mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
			if (mysql_errno() == 1062) {
				// the keyword already exists in the table: fetch its id
				$result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
				echo mysql_error();
				$row = mysql_fetch_row($result);
				$keyword_id = $row[0];
				if ($keyword_id != "") {
					// remember the id so the failing insert is not repeated
					$all_keywords[$word] = $keyword_id;
				}
			} else {
				$keyword_id = mysql_insert_id();
				$all_keywords[$word] = $keyword_id;
				echo mysql_error();
			}
		}

		if (!isset($inserts[$wordmd5])) {
			$inserts[$wordmd5] = "";
		}
		$inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain)";
	}

	for ($i = 0; $i <= 15; $i++) {
		$char = dechex($i);
		if (!isset($inserts[$char])) {
			continue;
		}
		// drop the leading comma of the collected value list
		$values = substr($inserts[$char], 1);
		if ($values != "") {
			$query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain) values $values";
			mysql_query($query);
			echo mysql_error();
		}
	}
}

/**
 * Extracts the meta information of a page from its <head> section.
 *
 * Reads the robots, description and keywords meta tags (name attribute before
 * content attribute) and the href of the <base> tag. Commas and spaces in the
 * keywords are collapsed to single spaces; description and keywords are passed
 * through addslashes().
 *
 * @param string $file html source of the page
 * @return array with the keys
 *               'description' (string), 'keywords' (string),
 *               'nofollow' (1 when robots contains "nofollow", else 0),
 *               'noindex' (1 when robots contains "noindex", else 0),
 *               'base' (string);
 *               every key is present, with '' / 0 when the page has no
 *               (or an empty) head or the tag is missing
 */
function get_head_data($file) {
	$data = array(
		'description' => '',
		'keywords' => '',
		'nofollow' => 0,
		'noindex' => 0,
		'base' => ''
	);

	// \b keeps <header ...> from being mistaken for the head element
	$regs = array();
	if (!preg_match('@<head\b[^>]*>(.*?)<\/head>@si', $file, $regs)) {
		return $data;
	}
	$headdata = $regs[1];
	if ($headdata == "") {
		return $data;
	}

	$meta = array('robots' => '', 'description' => '', 'keywords' => '');
	$res = array();
	foreach (array_keys($meta) as $name) {
		if (preg_match("/<meta +name *=[\"']?".$name."[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res)) {
			$meta[$name] = $res[1];
		}
	}

	// e.g. <base href="http://www.consil.co.uk/index.php" />
	if (preg_match("/<base +href *= *[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res)) {
		$data['base'] = $res[1];
	}

	foreach (explode(",", strtolower($meta['robots'])) as $directive) {
		$directive = trim($directive);
		if ($directive == "noindex") {
			$data['noindex'] = 1;
		}
		if ($directive == "nofollow") {
			$data['nofollow'] = 1;
		}
	}

	$data['description'] = addslashes($meta['description']);
	$data['keywords'] = addslashes(preg_replace("/[, ]+/", " ", $meta['keywords']));
	return $data;
}

/**
 * preg_replace_callback() helper of clean_file(): turns the hexadecimal
 * character reference captured in $match[1] into the corresponding byte.
 */
function clean_file_hex_entity($match) {
	return chr(hexdec($match[1]));
}

/**
 * preg_replace_callback() helper of clean_file(): turns the decimal
 * character reference captured in $match[1] into the corresponding byte.
 */
function clean_file_dec_entity($match) {
	return chr((int) $match[1]);
}

/**
 * Reduces a downloaded document to the text that is indexed.
 *
 * Removes <link rel> tags, sphider_noindex sections, comments, scripts and
 * styles, takes the <title> out of the document, strips the remaining tags
 * and keeps that text as 'fulltext'. For the indexed 'content' the title,
 * optionally host and path ($index_host == 1) and the meta keywords
 * ($index_meta_keywords == 1) are appended, numeric character references and
 * the entities listed in $entities are decoded, everything is lowercased and
 * punctuation and runs of whitespace are replaced by single spaces.
 *
 * @param string $file document source
 * @param string $url  url of the document
 * @param string $type document type; for 'pdf' and 'doc' without a <title>
 *                     the title is taken from the first (at most 40) bytes
 * @return array with the keys 'fulltext', 'content', 'title' (all passed
 *               through addslashes), 'description', 'keywords', 'nofollow',
 *               'noindex', 'base' (as delivered by get_head_data()),
 *               'host' and 'path' (path of $url without its file name)
 */
function clean_file($file, $url, $type) {
	global $entities, $index_host, $index_meta_keywords;

	$urlparts = parse_url($url);
	$host = isset($urlparts['host']) ? $urlparts['host'] : '';
	//remove filename from path
	$path = preg_replace('@[^/]+$@', "", isset($urlparts['path']) ? $urlparts['path'] : '');

	$file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
	$file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
	$file = preg_replace("@<!--.*?-->@si", " ",$file);
	$file = preg_replace("@<script[^>]*?>.*?</script>@si", " ",$file);

	// get_head_data() may deliver nothing for a page without head section
	$headdata = get_head_data($file);
	if (!is_array($headdata)) {
		$headdata = array();
	}
	$headdata = array_merge(
		array('description' => '', 'keywords' => '', 'nofollow' => 0, 'noindex' => 0, 'base' => ''),
		$headdata
	);

	$title = "";
	$regs = array();
	if (preg_match("@<title *>(.*?)<\/title*>@si", $file, $regs)) {
		$title = trim($regs[1]);
		$file = str_replace($regs[0], "", $file);
	} else if ($type == 'pdf' || $type == 'doc') { //the title of a non-html file is its first few words
		$title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
	}

	$file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);

	//create spaces between tags, so that removing tags doesnt concatenate strings
	$file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
	$file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
	$file = strip_tags($file);
	$file = preg_replace("/&nbsp;/", " ", $file);

	$fulltext = $file;
	$file .= " ".$title;
	if ($index_host == 1) {
		$file = $file." ".$host." ".$path;
	}
	if ($index_meta_keywords == 1) {
		$file = $file." ".$headdata['keywords'];
	}

	//replace codes with ascii chars (callbacks instead of the removed /e modifier)
	$file = preg_replace_callback('~&#x([0-9a-f]+);~i', 'clean_file_hex_entity', $file);
	$file = preg_replace_callback('~&#([0-9]+);~', 'clean_file_dec_entity', $file);
	$file = strtolower($file);
	if (is_array($entities)) {
		foreach ($entities as $entity => $replacement) {
			$file = preg_replace("/".$entity."/i", $replacement, $file);
		}
	}
	$file = preg_replace("/&[a-z]{1,6};/", " ", $file);
	$file = preg_replace("/[\*\^\+\?\\\.\[\]\^\$\|\{\)\(\}~!\"\/@#$%&=`;><:,]+/", " ", $file);
	$file = preg_replace("/\s+/", " ", $file);

	$data = array();
	$data['fulltext'] = addslashes($fulltext);
	$data['content'] = addslashes($file);
	$data['title'] = addslashes($title);
	$data['description'] = $headdata['description'];
	$data['keywords'] = $headdata['keywords'];
	$data['host'] = $host;
	$data['path'] = $path;
	$data['nofollow'] = $headdata['nofollow'];
	$data['noindex'] = $headdata['noindex'];
	$data['base'] = $headdata['base'];

	return $data;
}

/**
 * Helper of calc_weights(): builds a lookup table (word => true) from a list
 * of array(1 => word, 2 => count) entries as delivered by unique_array().
 */
function calc_weights_word_set($entries) {
	$set = array();
	foreach ($entries as $entry) {
		$set[$entry[1]] = true;
	}
	return $set;
}

/**
 * Replaces the occurrence count of every word by its ranking weight.
 *
 * For each word it is determined whether the word also occurs in the title,
 * in the host name or path of the url (only when $index_host is 1) and in the
 * meta keywords (only when $index_meta_keywords is 1); these flags, the count
 * and the number of "/" in $path are passed to calc_weight() and the result is
 * stored as integer in element 2 of the word entry.
 *
 * Membership is tested with exact word lookups instead of scanning every list
 * for every word.
 *
 * @param array  $wordarray list of array(1 => word, 2 => count)
 * @param string $title     page title
 * @param string $host      host name of the page url
 * @param string $path      path of the page url
 * @param string $keywords  meta keywords of the page
 * @return array $wordarray with element 2 of every entry replaced by the weight
 */
function calc_weights($wordarray, $title, $host, $path, $keywords) {
	global $index_host, $index_meta_keywords;
	$hostarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($host))));
	$patharray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($path))));
	$titlearray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($title))));
	$keywordsarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($keywords))));
	$path_depth = countSubstrs($path, "/");

	$host_words = calc_weights_word_set($hostarray);
	$path_words = calc_weights_word_set($patharray);
	$title_words = calc_weights_word_set($titlearray);
	$keyword_words = calc_weights_word_set($keywordsarray);

	foreach ($wordarray as $wid => $word) {
		$term = $word[1];

		$word_in_path = 0;
		$word_in_domain = 0;
		if ($index_host == 1) {
			$word_in_path = isset($path_words[$term]) ? 1 : 0;
			$word_in_domain = isset($host_words[$term]) ? 1 : 0;
		}

		$meta_keyword = 0;
		if ($index_meta_keywords == 1) {
			$meta_keyword = isset($keyword_words[$term]) ? 1 : 0;
		}

		$word_in_title = isset($title_words[$term]) ? 1 : 0;

		$wordarray[$wid][2] = (int) (calc_weight($word[2], $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword));
	}
	reset($wordarray);
	return $wordarray;
}

/**
 * Tells whether a page with the given checksum is already stored.
 *
 * Any MySQL error message of the lookup is echoed.
 *
 * @param string $md5sum checksum to look for in the md5sum column of the links table
 * @return bool true when at least one link row carries this checksum
 */
function isDuplicateMD5($md5sum) {
	global $mysql_table_prefix;

	$sql = "select link_id from ".$mysql_table_prefix."links where md5sum='$md5sum'";
	$matches = mysql_query($sql);
	echo mysql_error();

	return mysql_num_rows($matches) > 0;
}

/**
 * Helper of check_include(): tests one rule against a link.
 *
 * A rule starting with "*" is a complete regular expression (delimiters
 * included) that follows the asterisk; any other rule is a plain substring.
 */
function check_include_rule_matches($link, $rule) {
	if (substr($rule, 0, 1) == '*') {
		return (bool) preg_match(substr($rule, 1), $link);
	}
	return strpos($link, $rule) !== false;
}

/**
 * Decides whether a link passes the "must include" / "must not include" rules.
 *
 * Both rule lists hold one rule per line; blank lines are ignored and every
 * rule is trimmed. A link is rejected as soon as one exclusion rule matches.
 * When inclusion rules are given, the link is accepted only if at least one
 * of them matches; without inclusion rules every non-excluded link passes.
 *
 * @param string $link    url to test
 * @param string $inc     newline separated rules of which one must match ("" = no restriction)
 * @param string $not_inc newline separated rules of which none may match
 * @return bool true when the link may be indexed
 */
function check_include($link, $inc, $not_inc) {
	$url_inc = array();
	$url_not_inc = array();
	if ($inc != "") {
		$url_inc = explode("\n", $inc);
	}
	if ($not_inc != "") {
		$url_not_inc = explode("\n", $not_inc);
	}

	foreach ($url_not_inc as $str) {
		$str = trim($str);
		if ($str != "" && check_include_rule_matches($link, $str)) {
			return false;
		}
	}

	if ($inc == "") {
		return true;
	}

	foreach ($url_inc as $str) {
		$str = trim($str);
		if ($str != "" && check_include_rule_matches($link, $str)) {
			return true;
		}
	}
	return false;
}

/**
 * Handles a page that could not be fetched any more.
 *
 * Looks the url up in the links table. While the 'visible' counter of the row
 * is above zero it is decremented; once it has reached zero the link row and
 * its rows in all sixteen link_keyword0 .. link_keywordf tables are deleted
 * and 'pageRemoved' is reported through printStandardReport(). Urls that are
 * not in the table are ignored. MySQL error messages are echoed.
 *
 * NOTE(review): $url is placed into the SQL text as given - presumably the
 * callers pass it in the same form in which it was stored; confirm.
 *
 * @param string $url url of the page
 * @return void
 */
function check_for_removal($url) {
	global $mysql_table_prefix;
	global $command_line;

	$lookup = mysql_query("select link_id, visible from ".$mysql_table_prefix."links"." where url='$url'");
	echo mysql_error();
	if (!(mysql_num_rows($lookup) > 0)) {
		return;
	}

	$link_row = mysql_fetch_row($lookup);
	$link_id = $link_row[0];
	$visible = $link_row[1];

	if ($visible > 0) {
		// give the page another chance: only lower its counter
		$visible = $visible - 1;
		mysql_query("update ".$mysql_table_prefix."links set visible=$visible where link_id=$link_id");
		echo mysql_error();
		return;
	}

	mysql_query("delete from ".$mysql_table_prefix."links where link_id=$link_id");
	echo mysql_error();
	foreach (str_split('0123456789abcdef') as $char) {
		mysql_query("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
		echo mysql_error();
	}
	printStandardReport('pageRemoved',$command_line);
}

/**
 * Normalises a url taken from html source: "&amp;" becomes "&" and every
 * space becomes "%20".
 *
 * @param string $url url to normalise
 * @return string the normalised url
 */
function convert_url($url) {
	// the replacements are applied one after the other, in this order
	return str_replace(array("&amp;", " "), array("&", "%20"), $url);
}

/**
 * Converts the binary contents of a pdf, doc, xls or ppt document to text.
 *
 * The contents are written to a uniquely named temporary file in $tmp_dir and
 * the external converter configured for the type ($pdftotext_path,
 * $catdoc_path, $xls2csv_path or $catppt_path) is run on it; the output lines
 * of the converter are joined with single spaces. The temporary file is
 * removed afterwards. The script is terminated with die() when the temporary
 * file cannot be created or written.
 *
 * @param string $contents    raw document
 * @param string $source_type 'pdf', 'doc', 'xls' or 'ppt'
 * @return string extracted text; '' for any other type or when the converter
 *                prints nothing
 */
function extract_text($contents, $source_type) {
	global $tmp_dir, $pdftotext_path, $catdoc_path, $xls2csv_path, $catppt_path;

	// a unique name keeps two indexers that run at the same time from
	// overwriting each other's file
	$filename = tempnam((string) $tmp_dir, "sph");
	if ($filename === false) {
		die ("Cannot create temporary file in $tmp_dir");
	}
	if (!$handle = fopen($filename, 'w')) {
		die ("Cannot open file $filename");
	}

	if (fwrite($handle, $contents) === FALSE) {
		fclose($handle);
		unlink($filename);
		die ("Cannot write to file $filename");
	}
	fclose($handle);

	// quote the file name for the shell (the directory may contain spaces)
	$file_arg = escapeshellarg($filename);
	$command = "";
	if ($source_type == 'pdf') {
		$command = $pdftotext_path." ".$file_arg." -";
	} else if ($source_type == 'doc') {
		$command = $catdoc_path." ".$file_arg;
	} else if ($source_type == 'xls') {
		$command = $xls2csv_path." ".$file_arg;
	} else if ($source_type == 'ppt') {
		$command = $catppt_path." ".$file_arg;
	}

	$result = array();
	$retval = 0;
	if ($command != "") {
		exec($command, $result, $retval);
	}

	unlink ($filename);
	return implode(' ', $result);
}

/**
 * Calculates the ranking weight of one word on one page.
 *
 * The occurrence count is increased by the configured bonus ($title_weight,
 * $domain_weight, $path_weight, $meta_weight) for every flag that is set,
 * multiplied by 10 and divided by 0.8 + 0.2 * $path_depth.
 *
 * @param int $words_in_page  occurrence count of the word
 * @param int $word_in_title  1 when the word occurs in the title, else 0
 * @param int $word_in_domain 1 when the word occurs in the host name, else 0
 * @param int $word_in_path   1 when the word occurs in the url path, else 0
 * @param int $path_depth     depth value of the url path
 * @param int $meta_keyword   1 when the word is a meta keyword, else 0
 * @return float|int the weight (not rounded)
 */
function calc_weight ($words_in_page, $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword) {
	global $title_weight, $domain_weight, $path_weight,$meta_weight;

	$score = $words_in_page + $word_in_title * $title_weight;
	$score = $score + $word_in_domain * $domain_weight;
	$score = $score + $word_in_path * $path_weight;
	$score = $score + $meta_keyword * $meta_weight;

	$depth_divisor = 0.8 + 0.2 * $path_depth;

	return $score * 10 / $depth_divisor;
}
 
/**
 * Removes a session id parameter (PHPSESSID, JSESSIONID, ASPSESSIONID or sid)
 * with an alphanumeric value from the end of a url, together with the "?" or
 * "&" in front of it. A session id that is followed by further parameters is
 * left untouched.
 *
 * @param string $url url to clean
 * @return string url without trailing session id
 */
function  remove_sessid($url) {
	$trailing_session_id = "/(\?|&)(PHPSESSID|JSESSIONID|ASPSESSIONID|sid)=[0-9a-zA-Z]+$/";

	return preg_replace($trailing_session_id, "", $url);
}

?>
