library(bioxm) args <- commandArgs(TRUE) outfile <- args[1] sessionId <- args[2] query <- args[3] view <- args[4] con <- bioxmConnect('../../bioxm2', sessionId=sessionId) df <- getData(con, query, view, with.types=TRUE) m <- as.matrix(df) group.variable.1 <- unlist(strsplit(getQueryVariable(con, query, number=1), ";")) group.variable.2 <- unlist(strsplit(getQueryVariable(con, query, number=2), ";")) group.variable.id.1 <- sapply(strsplit(group.variable.1, ":"), function(x){x[3]}) group.variable.id.2 <- sapply(strsplit(group.variable.2, ":"), function(x){x[3]}) num.samples <- length(group.variable.1) + length(group.variable.2) variable.names <- colnames(m); new.variable.names <- c(); for (name in variable.names) { if (name %in% group.variable.id.1) { new.variable.names <- append(new.variable.names, paste(name, "(Group1)")); } else if (name %in% group.variable.id.2) { new.variable.names <- append(new.variable.names, paste(name, "(Group2)")); } else { new.variable.names <- append(new.variable.names, name); } } colnames(m) <- new.variable.names; #write.table(m, file=outfile, sep="\t", quote=FALSE) loading.dir <- gsub('.out', '', outfile) dir.create(loading.dir) loading<-function(pca.result, pngfile) { par(mar=c(0,0,0,0)) GDD(file = pngfile, type = "png", width = 150, height = 180, ps = 12, bg = "white") plot(pca.result$rotation[,2], type="h", xlab='', ylab='', font.lab=2, cex.lab=0.6, cex.axis = 0.6, cex=0.8) # define the threshold for the negative loadings:k.n t.n<-pca.result$rotation[which(pca.result$rotation[,2]<0),2] k<-quantile(abs(t.n),probs = seq(0, 1, 0.01))[95] k.n<-as.numeric(k[1]) #the threshold for the positive loadings:k.p t.p<-pca.result$rotation[which(pca.result$rotation[,2]>0),2] k<-quantile(t.p,probs = seq(0, 1, 0.01))[95] k.p<-as.numeric(k[1]) #plot the lines of the threshold abline(h=0) abline(h=k.p,col="red") abline(h=-k.n,col="green") dev.off() } pca.analysis <- function(input, output.file.name.PC1, output.file.name.PC2, number.of.samples){ #input <- read.delim(input,row.names=NULL,header=T) #kegg.sub <- as.matrix(input) kegg.sub <- input #adapt the output of BioXM for PCA #there are several pathways in one row,split it: stock.list <- c() #print("start formatting") for(i in seq(kegg.sub[,3])){ kegg.tmp <- unlist(strsplit(as.character(kegg.sub[i,3]),";")) for(j in seq(kegg.tmp)){ stock.list <- rbind(stock.list,c(kegg.tmp[j],kegg.sub[i,-3])) } } #print("the input is formatted") unique.kegg.pathway.list <- unique(stock.list[,1]) #stocker for kegg pathways #stocker for the number of probes in each kegg pathway #stocker for the output files number.of.genes.stocker <- c() kegg.name.stocker <- c() final.pca.output.stocker.PC1 <- c() final.pca.output.stocker.PC2 <- c() loadings.images <- c() #pca analysis #for each kegg pathway calulates the PC1 and PC2 for( i in seq(unique.kegg.pathway.list)){ in.pathway <- which( stock.list[,1] %in% unique.kegg.pathway.list[i], arr.ind=T) # if there are more than 9 genes in a patwhay, we calculate the PC1 and PC2 if (length(in.pathway) > 9) { number.of.genes.stocker <- append(number.of.genes.stocker,length(in.pathway)) kegg.name.stocker <- append( kegg.name.stocker, unique.kegg.pathway.list[i]) pca.input <- stock.list[in.pathway,4:(number.of.samples+3)] pca.input.numeric <- matrix(0,nrow=dim(pca.input)[1], ncol=number.of.samples) #the input as numeric for(j in 1: number.of.samples){ pca.input.numeric[,j] <- as.numeric(pca.input[,j]) } pca.input <- pca.input.numeric pca.result <- prcomp(t(pca.input)) t.pca.output.stocker <- t(pca.result$x[,c(1,2)]) final.pca.output.stocker.PC1 <- rbind( final.pca.output.stocker.PC1, t.pca.output.stocker[1,] ) final.pca.output.stocker.PC2 <- rbind( final.pca.output.stocker.PC2, t.pca.output.stocker[2,] ) rand.str = gsub('0.', '', as.character(runif(1))) tempfile <- paste(loading.dir, '/loading', rand.str, sep='') loadings.images <- rbind(loadings.images, basename(tempfile)) loading(pca.result, tempfile) } } #bind the number of genes to the stockers of PC1 and PC2 final.pca.output.stocker.PC1 <- cbind(number.of.genes.stocker,final.pca.output.stocker.PC1) final.pca.output.stocker.PC2 <- cbind(number.of.genes.stocker,final.pca.output.stocker.PC2) rownames(final.pca.output.stocker.PC1) <- kegg.name.stocker rownames(final.pca.output.stocker.PC2) <- kegg.name.stocker colnames(final.pca.output.stocker.PC1) <- c(colnames(stock.list)[3], paste("PC1:", colnames(stock.list)[4:(number.of.samples+3)], sep="")) colnames(final.pca.output.stocker.PC2) <- paste("PC2:", colnames(stock.list)[3:(number.of.samples+3)], sep="") colnames(loadings.images) = 'Image' final <- cbind(final.pca.output.stocker.PC1, final.pca.output.stocker.PC2[,-1], loadings.images) write.table(final, output.file.name.PC1, sep="\t", row.names=T, quote=F) #write.table(final.pca.output.stocker.PC2,output.file.name.PC2,sep="\t",row.names=T,quote=F) } #call #pca.analysis("kegg.final.gse1786.txt" ,"PC1.yedek1.txt","PC2.yedek1.txt",24) pca.analysis(m, outfile, outfile, num.samples)