#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif

#include <sys/stat.h>

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_searchtool.h"
#include "udm_boolean.h"
#include "udm_xmalloc.h"
#include "udm_charset.h"
#include "udm_spell.h"
#include "udm_stopwords.h"


/*
 * Maximum number of UDM_SEARCHWORD entries kept in one cache file.
 * UdmStoreToCache() writes at most this many entries and UdmFindInCache()
 * refuses pages that would reach past it.  A value of 0 disables both limits.
 */
static const size_t search_cache_size=1000;

/* Number of set bits in every possible byte value (index = byte value). */
static const int nbits[256] = {
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

/*
 * Count the set bits in the low 32 bits of v by summing the table entries
 * of its four bytes.
 *
 * The value is shifted as unsigned: right-shifting a negative int is
 * implementation-defined, while the unsigned shift always yields the
 * plain byte.
 */
static size_t bit_count(int v)
{
  const unsigned int u=(unsigned int)v;

  return (size_t)(nbits[u&0xFF]
                 +nbits[(u>>8)&0xFF]
                 +nbits[(u>>16)&0xFF]
                 +nbits[(u>>24)&0xFF]);
}


/********** QSORT functions *******************************/

static int cmpword(const void *s1,const void *s2){
int res;
	if(!(res=(((const UDM_SEARCHWORD*)s2)->count)-(((const UDM_SEARCHWORD*)s1)->count)))
		if(!(res=(((const UDM_SEARCHWORD*)s2)->pos-(((const UDM_SEARCHWORD*)s1)->pos))))
			if(!(res=(((const UDM_SEARCHWORD*)s2)->weight-(((const UDM_SEARCHWORD*)s1)->weight))))
				if(!(res=(((const UDM_SEARCHWORD*)s1)->url_id-(((const UDM_SEARCHWORD*)s2)->url_id))));
	return(res);
}

static int cmpurlid(const void *s1,const void *s2){
	int res;
	if(!(res=((const UDM_SEARCHWORD*)s1)->url_id-((const UDM_SEARCHWORD*)s2)->url_id))
		if(!(res=(((const UDM_SEARCHWORD*)s1)->pos)-(((const UDM_SEARCHWORD*)s2)->pos)));
	return(res);
}

/*****************************************************/




/***************** Results cache functions **************/

/*
 * Write a search result set to the cache file fname.
 *
 * File layout, in order:
 *   - the first 4 bytes of nwrd (total number of results found),
 *   - a fixed 1024-byte, NUL-terminated copy of query->wordinfo,
 *   - up to search_cache_size UDM_SEARCHWORD entries (all of them when
 *     search_cache_size is 0).
 *
 * When query->page_number is below FAST_PRESORT_PAGES the array is first
 * sorted with cmpword.  Caching is best effort: if the file cannot be
 * opened nothing is written, and if any write or the close fails the
 * incomplete file is removed so it cannot be served later.
 *
 * The function takes ownership of wrd and releases it with UDM_FREE in
 * every case.  It always returns 0.
 *
 * NOTE(review): only the first 4 bytes of the size_t nwrd are written; this
 * matches UdmFindInCache() but presumably assumes a little-endian layout
 * when size_t is wider than 4 bytes - confirm.
 */
int UdmStoreToCache(UDM_AGENT * query,char * fname,UDM_SEARCHWORD * wrd, size_t nwrd){
	int fd;
	char info[1024]="";
	size_t nstore;
	size_t nbytes;
	int ok;

	if(query->page_number<FAST_PRESORT_PAGES)qsort((void*)wrd,nwrd,sizeof(UDM_SEARCHWORD),cmpword);
#ifdef DEBUG_CACHE
	fprintf(stderr,"write to %s\n",fname);
#endif
	if((fd=open(fname,O_WRONLY|O_CREAT|O_TRUNC|UDM_BINARY,UDM_IWRITE))>=0){
#ifdef DEBUG_CACHE
		fprintf(stderr,"found:%lu\n",(unsigned long)nwrd);
#endif
		/* info is zero-filled, so copying at most size-1 bytes keeps it terminated */
		strncpy(info,query->wordinfo,sizeof(info)-1);

		/* Cap the number of stored entries unless the cap is disabled */
		if((search_cache_size==0)||(nwrd<search_cache_size)){
			nstore=nwrd;
		}else{
			nstore=search_cache_size;
		}
		nbytes=nstore*sizeof(UDM_SEARCHWORD);

		/* A short write counts as a failure: the reader expects whole records */
		ok=((long)write(fd,&nwrd,4)==4L);
		ok=ok&&((long)write(fd,info,sizeof(info))==(long)sizeof(info));
		ok=ok&&((long)write(fd,wrd,nbytes)==(long)nbytes);
		if(close(fd)!=0)ok=0;

		if(!ok){
			/* Do not leave a truncated cache file behind */
			unlink(fname);
		}
	}else{
#ifdef DEBUG_CACHE
		fprintf(stderr,"%s\n",strerror(errno));
#endif
	}
	UDM_FREE(wrd);
	return(0);
}

/*
 * Load one page of results from the cache file named by query_id.
 *
 * The file is expected to hold a 4-byte result total, a 1024-byte wordinfo
 * string and then an array of UDM_SEARCHWORD entries (the layout written by
 * UdmStoreToCache).  The total is read into Agent->total_found, the string is
 * copied into Agent->wordinfo, and the entries of page Agent->page_number
 * (Agent->page_size entries per page) are read into a newly allocated array.
 *
 * Returns the array and stores the number of entries actually read in *nwrd
 * (which may be less than a full page).  Returns NULL with *nwrd == 0 when
 * the requested page lies beyond search_cache_size, the file cannot be
 * opened, the header is incomplete, or any read/seek/allocation fails.
 * Agent->total_found and Agent->wordinfo may already have been updated when
 * a later step fails.
 *
 * NOTE(review): exactly 4 bytes are read into Agent->total_found; if that
 * field is wider than 4 bytes its remaining bytes keep their old value -
 * confirm the field type.
 */
UDM_SEARCHWORD * UdmFindInCache(UDM_AGENT * Agent,char * query_id,int * nwrd){
	char info[1024];
	int fd;
	UDM_SEARCHWORD * wrd=NULL;
	int bytes;

	*nwrd=0;
	if((search_cache_size!=0)&&((Agent->page_number+1)*Agent->page_size>search_cache_size))return(NULL);

	/* query_id already is the file name; no intermediate copy is needed */
	if((fd=open(query_id,O_RDONLY|UDM_BINARY))<0)return(NULL);

	/* A truncated header means the cache file is unusable */
	if(4L!=(long)read(fd,&Agent->total_found,4))goto fail;
	if((long)sizeof(info)!=(long)read(fd,info,sizeof(info)))goto fail;

	/* Never trust the file to contain a terminator */
	info[sizeof(info)-1]='\0';
	strcpy(Agent->wordinfo,info);
#ifdef DEBUG_CACHE
	fprintf(stderr,"found: %d wordinfo: %s\n",(int)Agent->total_found,info);
#endif
	wrd=(UDM_SEARCHWORD*)UdmXmalloc(Agent->page_size*sizeof(UDM_SEARCHWORD));
	if(!wrd)goto fail;

	/* Skip the entries of the preceding pages */
	if(-1==lseek(fd,(off_t)(Agent->page_number*Agent->page_size*sizeof(UDM_SEARCHWORD)),SEEK_CUR))goto fail;
	if(-1==(bytes=read(fd,wrd,Agent->page_size*sizeof(UDM_SEARCHWORD))))goto fail;

	close(fd);
	*nwrd=bytes/sizeof(UDM_SEARCHWORD);

	return(wrd);

fail:
	close(fd);
	free(wrd);
	return(NULL);
}
/****************************************************/



/*
 * Sort the num entries of wrd in place with the cmpword comparator
 * (count, pos and weight descending, then url_id ascending).
 */
void UdmSortSearchWordsByWeight(UDM_SEARCHWORD * wrd,size_t num){
	const size_t entry_size=sizeof(*wrd);

	qsort(wrd,num,entry_size,cmpword);
}

/*
 * Sort the num entries of wrd in place with the cmpurlid comparator
 * (ascending url_id, then ascending pos).
 */
void UdmSortSearchWordsByURL(UDM_SEARCHWORD * wrd,size_t num){
	const size_t entry_size=sizeof(*wrd);

	qsort(wrd,num,entry_size,cmpurlid);
}

/* Find topcount best results */
void UdmWrdTopSort(UDM_SEARCHWORD * wrd, size_t nwrd,size_t topcount){
	size_t j;
	UDM_SEARCHWORD w;

#ifdef DEBUG_TOP_SORT
	fprintf(stderr,"top+1=%d nwrd=%d\n",topcount+1,nwrd);
#endif

	qsort((void*)wrd,topcount+1,sizeof(UDM_SEARCHWORD),cmpword);
	for(j=topcount;j<nwrd;j++){
		register int res;
		if(!(res=(wrd[j].count-wrd[topcount].count)))
		if(!(res=(wrd[j].weight-wrd[topcount].weight)))
		if(!(res=(wrd[topcount].url_id-wrd[j].url_id)));
#ifdef DEBUG_TOP_SORT
fprintf(stderr,"(%d,%d,%d) %d (%d,%d,%d) %d\n",
	wrd[topcount].count,wrd[topcount].weight,wrd[topcount].url_id,topcount,
	wrd[j].count,wrd[j].weight,wrd[j].url_id,j);
#endif
		if(res>0){
			size_t l,c,r;
			
			l=0;r=topcount;
			while(l<r){
				c=(l+r)/2;
				if(!(res=(wrd[c].count-wrd[j].count)))
				if(!(res=(wrd[c].weight-wrd[j].weight)))
				if(!(res=(wrd[j].url_id-wrd[c].url_id)));
				if(res>0){
					l=c+1;
				}else{
					r=c;
				}
			}
			w=wrd[topcount];
			memmove(&wrd[r+1],&wrd[r],(topcount-r)*sizeof(UDM_SEARCHWORD));
			wrd[r]=wrd[j];
			wrd[j]=w;
		}
	}
}



/*
 * Parse the query string txt into the search state of query.
 *
 * Steps:
 *   1. If weight_factor is non-empty, its last (up to) eight characters are
 *      decoded with UdmHex2Int() into wf[0..7], last character first.
 *   2. wordinfo, total_found, words_in_query and nitems are reset.
 *   3. A private copy of txt is split into lexems with UdmGetLex().  Every
 *      lexem is pushed onto query->items.  A word lexem gets the bit mask
 *      1<<words_in_query as its argument and is normalized (through the
 *      ispell server or locally, depending on Conf->ispell_mode); the first
 *      normalized form is used when one is available, else the raw word.
 *   4. In UDM_MATCH_WORD mode, stopwords and words shorter than min_word_len
 *      or longer than max_word_len are only reported in wordinfo.  All other
 *      words are copied into words[], with their CRC32 in cwords[] and their
 *      index in wordorders[], while fewer than UDM_MAXWORDPERQUERY-1 words
 *      have been stored.
 *
 * Parsing stops early once UDM_MAXSTACK items have been pushed, so that
 * an over-long query cannot overrun the item stack.
 *
 * Returns 0 on success and 1 when a string copy cannot be allocated.
 *
 * NOTE(review): the appends to query->wordinfo are not bounded; the size of
 * that buffer is not visible here - confirm it can hold the stopword report
 * for the longest accepted query.
 */
int UdmPrepare(UDM_AGENT * query,char * txt){
	char * lasttok;
	char * w=NULL;
	char * text;
	int lex;

	if((query->weight_factor)&&(query->weight_factor[0])){
		int i;
		int len=strlen(query->weight_factor);
		char *end=query->weight_factor+len-1;

		/* Walk backwards from the last character */
		for(i=0;(i<len)&&(i<8);i++){
			query->wf[i]=UdmHex2Int(*(end-i));
		}
	}

	/* FIXME: add free words */
	query->wordinfo[0]='\0';
	query->total_found=0;
	query->words_in_query=0;
	query->nitems=0;

	/* The lexer advances through the text via lasttok, so work on a copy */
	text=strdup(txt);
	if(!text)return(1);
	lasttok=text;

	/* Parse query and build boolean search stack */
	for(lex=UdmGetLex(&w,&lasttok,query->charset);
	    lex!=-1;
	    lex=UdmGetLex(&w,&lasttok,query->charset)){
		char ** ww, *rw;
		int len;

		/* Stack is full: ignore the rest of the query */
		if(query->nitems>=UDM_MAXSTACK)break;

		if(lex!=UDM_STACK_WORD){
			/* Operator or bracket: goes onto the stack as is */
			query->items[query->nitems].cmd=lex;
			query->items[query->nitems].arg=0;
			query->nitems++;
			continue;
		}

		if(query->Conf->ispell_mode & UDM_ISPELL_MODE_SERVER){
			ww=UdmNormalizeWordFromServer(query,w);
		}else{
			ww=UdmNormalizeWord(query,w);
		}
		/* First normalized form if there is one, else the word as typed */
		rw=(ww&&*ww)?*ww:w;
		query->items[query->nitems].cmd=UDM_STACK_WORD;
		query->items[query->nitems].arg=1<<(query->words_in_query);
		query->nitems++;

		len=strlen(rw);

		if(query->word_match==UDM_MATCH_WORD){
			/* Check stopword only when full word         */
			/* Substring searches should not exclude them */
			if(UdmIsStopWord(query->Conf,rw)||(query->Conf->min_word_len>len)||(query->Conf->max_word_len<len)){
				if(query->wordinfo[0])strcat(query->wordinfo,", ");
				sprintf(UDM_STREND(query->wordinfo)," %s :stopword",rw);
				continue;
			}
		}
		if(query->words_in_query<UDM_MAXWORDPERQUERY-1){
			char *copy=strdup(rw);

			if(!copy){
				free(text);
				return(1);
			}
			query->words[query->words_in_query]=copy;
			query->cwords[query->words_in_query]=UdmStrCRC32(rw);
			query->wordorders[query->words_in_query]=query->words_in_query;
			query->words_in_query++;
		}
	}
	free(text);
	return(0);
}

/*
 * Pack a document's rank into one value:
 * uniq_words shifted left by 24, weight by 16 and nwords by 8, OR-ed
 * together.  The inputs are not clamped, so large values spill into the
 * neighbouring field.  url_id is not used.
 */
static size_t UdmCalcWeight(int url_id,size_t weight,size_t uniq_words,size_t nwords){
	const size_t words_part=(size_t)(uniq_words<<24);
	const size_t weight_part=(size_t)(weight<<16);
	const size_t count_part=(size_t)(nwords<<8);

	(void)url_id;
	return(words_part|weight_part|count_part);
}

/*
 * Compute a document's rank from its per-phrase-length weights.
 *
 * phr must have words_in_query elements; the main loop multiplies phr[i]
 * by i+1, and the last element is handled separately.
 *
 *   - UDM_MODE_PHRASE: the result is phr[words_in_query-1] unchanged.
 *   - any other mode:  the sum of phr[i]*(i+1) over all but the last
 *     element is capped at 0xFFFF, the last element is capped at 0xFF
 *     (the cap is written back into phr), and the result is
 *     last<<24 | uniq_words<<16 | sum.
 *
 * Returns 0 when words_in_query is 0; without that check the index
 * words_in_query-1 would wrap around and read outside phr.
 * url_id and weight are not used.
 */
static size_t UdmCalcPhraseWeight(int url_id,size_t weight,int uniq_words,size_t *phr,size_t words_in_query,int search_mode){
	size_t res=0;
	size_t last;
	size_t i;

	(void)url_id;
	(void)weight;

	if(words_in_query==0)return(0);
	last=words_in_query-1;

	if(search_mode==UDM_MODE_PHRASE){
		return(phr[last]);
	}

	for(i=0;i<last;i++){
		res+=phr[i]*(i+1);
	}
	if(res>0xFFFF)res=0xFFFF;
	if(phr[last]>0xFF)phr[last]=0xFF;
	res=(size_t)(phr[last]<<24)|(size_t)(uniq_words<<16)|res;
	return res;
}

void UdmGroupByURL(UDM_AGENT * query,UDM_SEARCHWORD * wrd){
	size_t i,j=0;
	size_t Doc_weight=wrd[0].weight;
	size_t Doc_nwords=wrd[0].pos;
	/* BOOL stuff */
	size_t item;
	UDM_STACK_ITEM temp_items[UDM_MAXSTACK];
	/* PHRASE stuff */
	size_t *Doc_phr;
	short  phr_beg=wrd[0].pos;
	size_t phr_size=query->words_in_query*sizeof(size_t);
	int    phr_mask=wrd[0].count;
	size_t phr_weight=wrd[0].weight;

	if(query->search_mode==UDM_MODE_BOOL){
		memcpy(temp_items,query->items,query->nitems*sizeof(UDM_STACK_ITEM));
	}
	if(query->Conf->use_phrases==1){
		Doc_phr=(size_t*)malloc(phr_size);
		bzero(Doc_phr, phr_size);
	}
	for(i=1;i<query->total_found;i++){
		/* Group by url_id */
		if(wrd[j].url_id==wrd[i].url_id){
			/* Same document */
			wrd[j].count|=wrd[i].count;
			Doc_weight+=wrd[i].weight;
			if(query->Conf->use_phrases==1){
				/* Calculate phrase (subphrase) summary weight */
				if((wrd[i].pos-phr_beg)<=((bit_count(phr_mask)+1))){
					phr_mask|=wrd[i].count;
					phr_weight+=wrd[i].weight;
				}else{
					Doc_phr[bit_count(phr_mask)-1]+=phr_weight;
					phr_beg=wrd[i].pos;
					phr_mask=wrd[i].count;
					phr_weight=wrd[i].weight;
				}
			}else{
				Doc_nwords+=wrd[i].pos;
			}
		}else{
			/* Next document */
			if(query->Conf->use_phrases==1){
				Doc_phr[bit_count(phr_mask)-1]+=phr_weight;
			}

			if(query->search_mode==UDM_MODE_BOOL){
				/* Create a copy of lexem array       */
				/* and change word masks by 1 or 0    */
				/* depending on whether word presents */
				/* in the search query being executed */
				for(item=0;item<query->nitems;item++){
					if(temp_items[item].cmd==UDM_STACK_WORD){
						/* Change word mask by 1 or 0 */
						temp_items[item].arg=((query->items[item].arg)&(wrd[j].count))?1:0;
					}
				}
				if(UdmCalcBoolItems(temp_items,query->nitems)){
					wrd[j].pos=bit_count(wrd[j].count);
					if(query->Conf->use_phrases==1){
						wrd[j].count=UdmCalcPhraseWeight(wrd[j].url_id,Doc_weight,wrd[j].pos,Doc_phr,query->words_in_query,query->search_mode);
					}else{
						wrd[j].count=UdmCalcWeight(wrd[j].url_id,Doc_weight,bit_count(wrd[j].count),Doc_nwords);
					}
					j++;
				}else{
					/* Skip this result */
				}
			}else
			if((query->search_mode==UDM_MODE_ALL)&&(bit_count(wrd[j].count)<query->words_in_query)){
				/* Skip this result */
			}else
			if((query->search_mode==UDM_MODE_PHRASE)&&(Doc_phr[query->words_in_query-1]==0)){
				/* Skip this result */
			}else{
				wrd[j].pos=bit_count(wrd[j].count);
				if(query->Conf->use_phrases==1){
					wrd[j].count=UdmCalcPhraseWeight(wrd[j].url_id,Doc_weight,wrd[j].pos,Doc_phr,query->words_in_query,query->search_mode);
				}else{
					wrd[j].count=UdmCalcWeight(wrd[j].url_id,Doc_weight,bit_count(wrd[j].count),Doc_nwords);
				}
				j++;
			}
			Doc_weight=wrd[i].weight;
			if(query->Conf->use_phrases==1){
				phr_mask=wrd[i].count;
				phr_beg=wrd[i].pos;
				phr_weight=wrd[i].weight;
				bzero(Doc_phr, phr_size);
			}else{
				Doc_nwords+=wrd[i].pos;
			}
			wrd[j]=wrd[i];
		}
	}
	/* Check last word */
	if(query->Conf->use_phrases==1){
		Doc_phr[bit_count(phr_mask)-1]+=phr_weight;
	}
	switch(query->search_mode){
		case UDM_MODE_BOOL:
			for(item=0;item<query->nitems;item++){
				if(temp_items[item].cmd==UDM_STACK_WORD){
					/* Change word mask by 1 or 0 */
					temp_items[item].arg=((query->items[item].arg)&(wrd[j].count))?1:0;
				}
			}
			query->total_found=(UdmCalcBoolItems(temp_items,query->nitems))?j+1:j;
			break;
		case UDM_MODE_PHRASE:
			query->total_found=(Doc_phr[query->words_in_query-1]>0)?j+1:j;
			break;
		case UDM_MODE_ALL:
			query->total_found=(bit_count(wrd[j].count)>=query->words_in_query)?j+1:j;
			break;
		case UDM_MODE_ANY:
		default:
			query->total_found=j+1;
			break;
	}
	wrd[j].pos=bit_count(wrd[j].count);
	if(query->Conf->use_phrases==1){
		wrd[j].count=UdmCalcPhraseWeight(wrd[j].url_id,Doc_weight,wrd[j].pos,Doc_phr,query->words_in_query,query->search_mode);
		free(Doc_phr);
	}else{
		wrd[j].count=UdmCalcWeight(wrd[j].url_id,Doc_weight,bit_count(wrd[j].count),Doc_nwords);
	}
	return;
}
