## THIS IS FOR THE WHOLE WORLD DATA SEGREGATED INTO SUBPOPULATIONS
## IT MAKES A FINESTRUCTURE "mcmc xml" FILE FOR PCA, 
## BY ASSIGNING EACH INDIVIDUAL TO A POPULATION BASED ON ITS LABEL
##
## YOU ARE FREE TO USE AND MODIFY THIS FILE
## THIS FILE MAY NOT BE FIT FOR ANY PURPOSE, USE AT YOUR OWN RISK (ETC)
##
## AUTHOR: DANIEL LAWSON (dan.lawson@bristol.ac.uk)
## DATE: 2/3/2012
#####################################################
## USAGE: Edit the inputfile and outputfile file names. If you use a different naming system, also edit the "popIn" function to capture this...
##
## You then need to create the finestructure tree from the .xml file, either using the GUI or the "-m T -T 1" option
## e.g. finestructure -m T -T 1 -t 20000 AllHGDPexample.chunkcounts.out AllHGDPlabelledPopulations.xml AllHGDPlabelledPopulations.tree.xml
## then you can load it into the GUI
## e.g. finegui -c AllHGDPexample.chunkcounts.out -m AllHGDPlabelledPopulations.xml -t AllHGDPlabelledPopulations.tree.xml
## Go to "File->Manage Files" and load the "Raw data file", the "MCMC output file", and the "Processed Tree file"
## (tip: change the view to "aggregated (alternative)" if the GUI is slow to respond...)
## then label the populations by going to "Organise->Edit Populations" and click "Guess" (WORKS ONLY IF YOUR NAMING SCHEME IS THE SAME AS OURS!)
## Finally, go to "Plot->Principal Components Analysis"

## File names used by the rest of this script; edit both before running.
# the chromopainter file that is read in below
inputfile <- "AllHGDPexample.chunkcounts.out"
# the xml file that this script generates
outputfile <- "AllHGDPlabelledPopulations.xml"

## popIn: defines which "population" an individual label belongs to.
## The population label is the individual's label with every digit (0-9)
## removed; repeated labels in the result are collapsed to one entry each.
##
## x: character vector of individual labels (the script passes one at a time)
## returns: character vector of the distinct digit-free labels found in x
##
## NOTE: if your individuals are named some other way you can still use this
## approach: either build the "popnames" vector by hand (listing the population
## of every individual), or write your own "popIn" that matches your naming system.
popIn <- function(x) {
  unique(gsub(pattern = "[0-9]", replacement = "", x = x))
}

## Read in the data
## Only the row names of this matrix (the individual labels) are used below.
tdat<-as.matrix(read.csv(inputfile,row.names=1,skip=1)) # skip the first line, which is the value of "c"
tnames<-dimnames(tdat)[[1]]

## Fail early with a clear message rather than writing an empty "()" state
if(length(tnames)==0){
  stop("No individuals found in ",inputfile,call.=FALSE)
}

## The population that each individual is in (one label per individual).
## vapply enforces that popIn returns exactly one string for each individual.
popnames<-vapply(tnames,popIn,character(1))

collist<-unique(popnames) # all populations, in order of first appearance

## Construct, for each population, a comma separated list of its individuals
contpops<-lapply(collist,function(pop){
  paste(tnames[popnames %in% pop],collapse=",")
})

## Wrap each population in brackets and join them: "(a1,a2)(b1,b2,b3)..."
tpopA<-paste0("(",contpops,")")
tpop<-paste0(tpopA,collapse="") # tpop is now a valid finestructure state, we just have to construct the fake mcmc file

header<-paste0("<inflation>1</inflation>\n<burnin>0</burnin>\n<mcmclength>1</mcmclength>\n<skip>1</skip>\n<datafilename>",inputfile,"</datafilename>\n")

## Write the whole file in one call (sep="" so no spaces are inserted between the pieces);
## this overwrites any existing outputfile.
cat("<?xml version = '1.0' encoding = 'UTF-8'?>\n<outputFile>\n<header>\n<comment>Generated File</comment>\n",
    header,
    "</header>\n<Iteration>\n<Pop>",
    tpop,
    "</Pop>\n</Iteration>\n</outputFile>\n",
    file=outputfile,sep="")
