# wersja parsera z pakietem XML
# autor: Maciej Beręsewicz
#

## podstawowe
library(RCurl)
## dodatkowe
library(XML)
library(RJSONIO)
library(plyr)
library(memisc)

# Number of search-result pages to visit.
N <- 1000
# Not used in the visible part of the script; kept in case other code reads it.
linki <- NULL

# One search-result URL per page number 1..N.
motos <- paste0(
  "http://otomoto.pl/index.php?sect=search&sub=car&page=",
  seq_len(N),
  "&qid=2206521429&order_by=i"
)
# Register of offer links; it is updated by the scraping loop.
spis <- NULL

# Open the output file and divert everything printed with cat()/print()
# to it instead of the console.
otoMoto <- file("otoMoto.txt", open = "w")
sink(otoMoto, append = TRUE)

# Return the first value of the idx-th element of an xpathApply() result,
# or NA when the page did not contain that many matching nodes.
pick_value <- function(values, idx) {
  if (length(values) >= idx) values[[idx]][1] else NA_character_
}

# Download one offer page and print its description with cat().
# The header and the link are printed before the download is attempted.
# If the page cannot be fetched or parsed, nothing more is printed for it.
# Fields that are missing on the page are printed as NA instead of stopping
# the whole run with a "subscript out of bounds" error.
print_offer <- function(link) {
  ## link to the car
  cat("------------------------ Nowe Auto 1", "\n")
  cat(paste(link), "\n")

  doc2 <- tryCatch(
    htmlParse(link, encoding = "UTF-8"),
    error = function(e) NULL
  )
  if (is.null(doc2)) {
    return(invisible(NULL))
  }

  ## make and model
  auta4 <- xpathApply(doc2, "//ul[@id='adminNav']//span", xmlValue)
  cat("Marka:", pick_value(auta4, 1), "\nModel:", pick_value(auta4, 2), "\n")

  ## price: value from the first <dd><strong>, plus the second word of the
  ## first <dt class='main'> with any ":" removed
  auta5 <- pick_value(xpathApply(doc2, "//dd//strong", xmlValue), 1)
  auta6 <- xpathApply(doc2, "//dt[@class='main']", xmlValue)
  cena <- strsplit(pick_value(auta6, 1), " ")[[1]][2]
  cena <- gsub(":", "", cena)
  cat("Cena:", auta5, cena, "\n")

  ## description rows: the first <dt> is dropped, then rows are printed
  ## starting from index 2, paired with the <dd><span> of the same index
  ## (same pairing as before; only the bounds are now checked)
  typ <- xpathApply(doc2, "//dd//span", xmlValue)
  opis <- xpathApply(doc2, "//dt", xmlValue)[-1]
  for (j in seq_along(opis)[-1]) {
    cat(opis[[j]][1], pick_value(typ, j), "\n")
  }

  ## additional information; the second paragraph is printed only when it
  ## is shorter than 300 characters
  ## NOTE(review): the label below is reproduced exactly as found in the
  ## source; it looks mis-encoded - confirm the file encoding before changing.
  dodat <- xpathApply(doc2, "//div[@id='offerDetails']//p", xmlValue)
  if (length(dodat) >= 2) {
    cat("WyposaÅ¼enie dodatkowe:", dodat[[1]][1], "\n")
    if (nchar(dodat[[2]][1]) < 300) {
      cat("Informacje dodatkowe:", dodat[[2]][1], "\n")
    }
  }
  invisible(NULL)
}

# Walk through every search-result page, collect the offer links and print
# each offer once. `spis` holds the links already handled (character vector),
# so an offer that shows up on several pages is printed a single time.
# The finally clause restores console output and closes the output file even
# when the run stops with an error.
tryCatch({
  for (k in seq_along(motos)) {
    ## download the listing page; skip it if it cannot be fetched
    doc <- tryCatch(
      htmlParse(motos[k], encoding = "UTF-8"),
      error = function(e) NULL
    )
    if (is.null(doc)) next

    ## keep only href attributes that point to car offers
    hrefs <- as.character(unlist(
      xpathApply(doc, "//div[@class='boxHeadOM']//h4//a", xmlGetAttr, "href")
    ))
    hrefs <- hrefs[grepl("http://otomoto.pl/", hrefs, fixed = TRUE)]

    ## links not seen before (exact comparison, not a regex match)
    fresh <- setdiff(hrefs, spis)
    spis <- c(spis, fresh)

    for (link in fresh) {
      print_offer(link)
    }
  }
}, finally = {
  sink()
  close(otoMoto)
})