/* Phyle of filogenetic tree calculating functions for CLUSTAL V */ #include #include #include #include #include "clustalv.h" /* * Prototypes */ extern void *ckalloc(size_t); extern int getint(char *, int, int, int); extern void get_path(char *, char *); extern FILE * open_output_file(char *, char *, char *, char *); void init_trees(void); void phylogenetic_tree(void); void bootstrap_tree(void); void compare_tree(char **, char **, int *, int); void tree_gap_delete(void); /*flag all positions in alignment that have a gap */ void dna_distance_matrix(FILE *); void prot_distance_matrix(FILE *); void nj_tree(char **, FILE *); void print_tree(char **, FILE *, int *); void root_tree(char **, int); Boolean transition(int,int); /* * Global variables */ extern double **tmat; /* general nxn array of reals; allocated from main */ /* this is used as a distance matrix */ extern double **smat; extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */ extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/ extern Boolean kimura; /* Use correction for multiple substitutions */ extern Boolean empty; /* any sequences in memory? */ extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */ extern void error(char *,...); /* error reporting */ extern int nseqs; /* total no. of seqs. */ extern int *seqlen_array; /* the lengths of the sequences */ extern char **seq_array; /* the sequences */ extern char **names; /* the seq. names */ extern char seqname[]; /* name of input file */ static double *av; static int *kill; static int ran_factor; int boot_ntrials; /* number of bootstrap trials */ unsigned int boot_ran_seed; /* random number generator seed */ static int root_sequence; static int *boot_positions; static int *boot_totals; static char **standard_tree; static char **sample_tree; static FILE *phy_tree_file; static char phy_tree_name[FILENAMELEN+1]; static Boolean verbose; static char *tree_gaps; /* array of weights; 1 for use this posn.; 0 don't */ void init_trees() { register int i; tree_gaps = (char *)ckalloc( (MAXLEN+1) * sizeof (char) ); boot_positions = (int *)ckalloc( (MAXLEN+1) * sizeof (int) ); boot_totals = (int *)ckalloc( (MAXN+1) * sizeof (int) ); kill = (int *) ckalloc( (MAXN+1) * sizeof (int) ); av = (double *) ckalloc( (MAXN+1) * sizeof (double) ); standard_tree = (char **) ckalloc( (MAXN+1) * sizeof (char *) ); for(i=0; i G and T <--> C are transitions; all others are transversions. */ { if( ((base1 == 1) && (base2 == 3)) || ((base1 == 3) && (base2 == 1)) ) return TRUE; /* A <--> G */ if( ((base1 == 4) && (base2 == 2)) || ((base1 == 2) && (base2 == 4)) ) return TRUE; /* T <--> C */ return FALSE; } void tree_gap_delete() /* flag all positions in alignment that have a gap */ { /* in ANY sequence */ int seqn, posn; for(posn=1; posn<=seqlen_array[1]; ++posn) { tree_gaps[posn] = 0; for(seqn=1; seqn<=nseqs; ++seqn) { if(seq_array[seqn][posn] <= 0) { tree_gaps[posn] = 1; break; } } } } void nj_tree(char **tree_description, FILE *tree) { register int i; int l[4],nude,k; int nc,mini,minj,j,j1,ii,jj; double fnseqs,fnseqs2,sumd; double diq,djq,dij,d2r,dr,dio,djo,da; double tmin,total,dmin; double bi,bj,b1,b2,b3,branch[4]; int typei,typej,type[4]; /* 0 = node; 1 = OTU */ fnseqs = (double)nseqs; /*********************** First initialisation ***************************/ if(verbose) { fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n"); fprintf(tree,"\n Saitou, N. and Nei, M. (1987)"); fprintf(tree," The Neighbor-joining Method:"); fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees."); fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n"); fprintf(tree,"\n\n This is an UNROOTED tree\n"); fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n"); } mini = minj = 0; for(i=1;i<=nseqs;++i) { tmat[i][i] = av[i] = 0.0; kill[i] = 0; } /*********************** Enter The Main Cycle ***************************/ /* for(nc=1; nc<=(nseqs-3); ++nc) { */ /**start main cycle**/ for(nc=1; nc<=(nseqs-3); ++nc) { sumd = 0.0; for(j=2; j<=nseqs; ++j) for(i=1; i 0.0 ) typei = 0; else typei = 1; if( av[minj] > 0.0 ) typej = 0; else typej = 1; if(verbose) { fprintf(tree,"\n Cycle%4d = ",nc); if(typei == 0) fprintf(tree,"Node:%4d (%9.5f) joins ",mini,bi); else fprintf(tree," SEQ:%4d (%9.5f) joins ",mini,bi); if(typej == 0) fprintf(tree,"Node:%4d (%9.5f)",minj,bj); else fprintf(tree," SEQ:%4d (%9.5f)",minj,bj); fprintf(tree,"\n"); } for(i=1; i<=nseqs; i++) tree_description[nc][i] = 0; if(typei == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][mini] == 1) { for(j=1; j<=nseqs; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][mini] = 1; if(typej == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][minj] == 1) { for(j=1; j<=nseqs; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][minj] = 1; if(dmin <= 0.0) dmin = 0.0001; av[mini] = dmin * 0.5; /*........................Re-initialisation................................*/ fnseqs = fnseqs - 1.0; kill[minj] = 1; for(j=1; j<=nseqs; ++j) if( kill[j] != 1 ) { da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5; if( (mini - j) < 0 ) tmat[mini][j] = da; if( (mini - j) > 0) tmat[j][mini] = da; } for(j=1; j<=nseqs; ++j) tmat[minj][j] = tmat[j][minj] = 0.0; /****/ } /**end main cycle**/ /******************************Last Cycle (3 Seqs. left)********************/ nude = 1; for(i=1; i<=nseqs; ++i) if( kill[i] != 1 ) { l[nude] = i; nude = nude + 1; } b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5; b2 = tmat[l[1]][l[2]] - b1; b3 = tmat[l[1]][l[3]] - b1; branch[1] = b1 - av[l[1]]; branch[2] = b2 - av[l[2]]; branch[3] = b3 - av[l[3]]; for(i=1; i<=nseqs; i++) tree_description[nseqs-2][i] = 0; if(verbose) fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",nc); for(i=1; i<=3; ++i) { if( av[l[i]] > 0.0) { if(verbose) fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",l[i],branch[i]); for(k=nseqs-3; k>=1; k--) if(tree_description[k][l[i]] == 1) { for(j=1; j<=nseqs; j++) if(tree_description[k][j] == 1) tree_description[nseqs-2][j] = i; break; } } else { if(verbose) fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",l[i],branch[i]); tree_description[nseqs-2][l[i]] = i; } if(i < 3) { if(verbose) fprintf(tree,"joins"); } } if(verbose) fprintf(tree,"\n"); } void bootstrap_tree() { int i,j,ranno; int k,l,m,p; unsigned int num; char lin2[MAXLINE],path[MAXLINE+1]; if(empty) { error("You must load an alignment first"); return; } get_path(seqname, path); if((phy_tree_file = open_output_file( "\nEnter name for bootstrap output file ",path, phy_tree_name,"njb")) == NULL) return; for(i=0;i 0) fprintf(tree,"%7d",totals[row]); } fprintf(tree," \n"); for(col=1; col<=nseqs; col++) fprintf(tree,"%1d",tree_description[nseqs-2][col]); fprintf(tree,"\n"); } void dna_distance_matrix(FILE *tree) { int m,n,j,i; int res1, res2; double p,q,e,a,b,k; tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */ if(verbose) { fprintf(tree,"\n"); fprintf(tree,"\n DIST = percentage divergence (/100)"); fprintf(tree,"\n p = rate of transition (A <-> G; C <-> T)"); fprintf(tree,"\n q = rate of transversion"); fprintf(tree,"\n Length = number of sites used in comparison"); fprintf(tree,"\n"); if(tossgaps) { fprintf(tree,"\n All sites with gaps (in any sequence) deleted!"); fprintf(tree,"\n"); } if(kimura) { fprintf(tree,"\n Distances corrected by Kimura's 2 parameter model:"); fprintf(tree,"\n\n Kimura, M. (1980)"); fprintf(tree," A simple method for estimating evolutionary "); fprintf(tree,"rates of base"); fprintf(tree,"\n substitutions through comparative studies of "); fprintf(tree,"nucleotide sequences."); fprintf(tree,"\n J. Mol. Evol., 16, 111-120."); fprintf(tree,"\n\n"); } } for(m=1; m 0) ) goto skip; /* gap position */ res1 = seq_array[m][j]; res2 = seq_array[n][j]; if( (res1 < 1) || (res2 < 1) ) goto skip; /* gap in a seq*/ e = e + 1.0; if(res1 != res2) { if(transition(res1,res2)) p = p + 1.0; else q = q + 1.0; } skip:; } /* Kimura's 2 parameter correction for multiple substitutions */ if(!kimura) { k = (p+q)/e; if(p > 0.0) p = p/e; else p = 0.0; if(q > 0.0) q = q/e; else q = 0.0; tmat[m][n] = tmat[n][m] = k; if(verbose) /* if screen output */ fprintf(tree, "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n" ,m,n,k,p,q,e); } else { if(p > 0.0) p = p/e; else p = 0.0; if(q > 0.0) q = q/e; else q = 0.0; a = 1.0/(1.0-2.0*p-q); b = 1.0/(1.0-2.0*q); k = 0.5*log(a) + 0.25*log(b); tmat[m][n] = tmat[n][m] = k; if(verbose) /* if screen output */ fprintf(tree, "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n" ,m,n,k,p,q,e); } } } void prot_distance_matrix(FILE *tree) { int m,n,j,i; int res1, res2; double p,e,a,b,k; tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */ if(verbose) { fprintf(tree,"\n"); fprintf(tree,"\n DIST = percentage divergence (/100)"); fprintf(tree,"\n Length = number of sites used in comparison"); fprintf(tree,"\n\n"); if(tossgaps) { fprintf(tree,"\n All sites with gaps (in any sequence) deleted"); fprintf(tree,"\n"); } if(kimura) { fprintf(tree,"\n Distances corrected by Kimura's empirical method:"); fprintf(tree,"\n\n Kimura, M. (1983)"); fprintf(tree," The Neutral Theory of Molecular Evolution."); fprintf(tree,"\n Cambridge University Press, Cambridge, England."); fprintf(tree,"\n\n"); } } for(m=1; m 0) ) goto skip; /* gap position */ res1 = seq_array[m][j]; res2 = seq_array[n][j]; if( (res1 < 1) || (res2 < 1) ) goto skip; /* gap in a seq*/ e = e + 1.0; if(res1 != res2) p = p + 1.0; skip:; } if(p <= 0.0) k = 0.0; else k = p/e; if(kimura) if(k > 0.0) k = - log(1.0 - k - (k * k/5.0) ); tmat[m][n] = tmat[n][m] = k; if(verbose) /* if screen output */ fprintf(tree, "%4d vs.%4d DIST = %6.4f; length = %6.0f\n",m,n,k,e); } }