/* Phyle of filogenetic tree calculating functions for CLUSTAL W */ /* DES was here FEB. 1994 */ #include #include #include #include #include "clustalw.h" #include "dayhoff.h" /* set correction for amino acid distances >= 75% */ #include #include /* * Prototypes */ Boolean transition(sint base1, sint base2); void tree_gap_delete(void); void distance_matrix_output(FILE *ofile); void nj_tree(char **tree_description, FILE *tree); void compare_tree(char **tree1, char **tree2, sint *hits, sint n); void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap); void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap); sint two_way_split(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap); sint two_way_split_nexus(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap); void print_tree(char **tree_description, FILE *tree, sint *totals); static Boolean is_ambiguity(char c); static void overspill_message(sint overspill,sint total_dists); /* * Global variables */ extern sint max_names; extern double **tmat; /* general nxn array of reals; allocated from main */ /* this is used as a distance matrix */ extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */ extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/ extern Boolean kimura; /* Use correction for multiple substitutions */ extern Boolean output_tree_clustal; /* clustal text output for trees */ extern Boolean output_tree_phylip; /* phylip nested parentheses format */ extern Boolean output_tree_distances; /* phylip distance matrix */ extern Boolean output_tree_nexus; /* nexus format tree */ extern Boolean output_pim; /* perc identity matrix output Ramu */ extern sint bootstrap_format; /* bootstrap file format */ extern Boolean empty; /* any sequences in memory? */ extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */ extern sint nseqs; extern sint max_aln_length; extern sint *seqlen_array; /* the lengths of the sequences */ extern char **seq_array; /* the sequences */ extern char **names; /* the seq. names */ extern char seqname[]; /* name of input file */ extern sint gap_pos1,gap_pos2; extern Boolean use_ambiguities; extern char *amino_acid_codes; static double *av; static double *left_branch, *right_branch; static double *save_left_branch, *save_right_branch; static sint *boot_totals; static sint *tkill; /* The next line is a fossil from the days of using the cc ran() static int ran_factor; */ static sint *boot_positions; static FILE *phylip_phy_tree_file; static FILE *clustal_phy_tree_file; static FILE *distances_phy_tree_file; static FILE *nexus_phy_tree_file; static FILE *pim_file; /* Ramu */ static Boolean verbose; static char *tree_gaps; static sint first_seq, last_seq; /* array of weights; 1 for use this posn.; 0 don't */ extern sint boot_ntrials; /* number of bootstrap trials */ extern unsigned sint boot_ran_seed; /* random number generator seed */ void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name) /* Calculate a tree using the distances in the nseqs*nseqs array tmat. This is the routine for getting the REAL trees after alignment. */ { char path[FILENAMELEN+1]; sint i, j; sint overspill = 0; sint total_dists; static char **standard_tree; static char **save_tree; char lin2[10]; if(empty) { error("You must load an alignment first"); return; } if(nseqs<2) { error("Alignment has only %d sequences",nseqs); return; } first_seq=1; last_seq=nseqs; get_path(seqname,path); if(output_tree_clustal) { if (clustal_name[0]!=EOS) { if((clustal_phy_tree_file = open_explicit_file( clustal_name))==NULL) return; } else { if((clustal_phy_tree_file = open_output_file( "\nEnter name for CLUSTAL tree output file ",path, clustal_name,"nj")) == NULL) return; } } if(output_tree_phylip) { if (phylip_name[0]!=EOS) { if((phylip_phy_tree_file = open_explicit_file( phylip_name))==NULL) return; } else { if((phylip_phy_tree_file = open_output_file( "\nEnter name for PHYLIP tree output file ",path, phylip_name,"ph")) == NULL) return; } } if(output_tree_distances) { if (dist_name[0]!=EOS) { if((distances_phy_tree_file = open_explicit_file( dist_name))==NULL) return; } else { if((distances_phy_tree_file = open_output_file( "\nEnter name for distance matrix output file ",path, dist_name,"dst")) == NULL) return; } } if(output_tree_nexus) { if (nexus_name[0]!=EOS) { if((nexus_phy_tree_file = open_explicit_file( nexus_name))==NULL) return; } else { if((nexus_phy_tree_file = open_output_file( "\nEnter name for NEXUS tree output file ",path, nexus_name,"tre")) == NULL) return; } } if(output_pim) { if (pim_name[0]!=EOS) { if((pim_file = open_explicit_file( pim_name))==NULL) return; } else { if((pim_file = open_output_file( "\nEnter name for % Identity matrix output file ",path, pim_name,"pim")) == NULL) return; } } boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) ); for(j=1; j<=seqlen_array[first_seq]; ++j) boot_positions[j] = j; if(output_tree_clustal) { verbose = TRUE; /* Turn on file output */ if(dnaflag) overspill = dna_distance_matrix(clustal_phy_tree_file); else overspill = prot_distance_matrix(clustal_phy_tree_file); } if(output_tree_phylip) { verbose = FALSE; /* Turn off file output */ if(dnaflag) overspill = dna_distance_matrix(phylip_phy_tree_file); else overspill = prot_distance_matrix(phylip_phy_tree_file); } if(output_tree_nexus) { verbose = FALSE; /* Turn off file output */ if(dnaflag) overspill = dna_distance_matrix(nexus_phy_tree_file); else overspill = prot_distance_matrix(nexus_phy_tree_file); } if(output_pim) { /* Ramu */ verbose = FALSE; /* Turn off file output */ if(dnaflag) calc_percidentity(pim_file); else calc_percidentity(pim_file); } if(output_tree_distances) { verbose = FALSE; /* Turn off file output */ if(dnaflag) overspill = dna_distance_matrix(distances_phy_tree_file); else overspill = prot_distance_matrix(distances_phy_tree_file); distance_matrix_output(distances_phy_tree_file); } /* check if any distances overflowed the distance corrections */ if ( overspill > 0 ) { total_dists = (nseqs*(nseqs-1))/2; overspill_message(overspill,total_dists); } if(output_tree_clustal) verbose = TRUE; /* Turn on file output */ standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) ); for(i=0; i G and T <--> C are transitions; all others are transversions. */ { if( ((base1 == 0) && (base2 == 6)) || ((base1 == 6) && (base2 == 0)) ) return TRUE; /* A <--> G */ if( ((base1 ==17) && (base2 == 2)) || ((base1 == 2) && (base2 ==17)) ) return TRUE; /* T <--> C */ return FALSE; } void tree_gap_delete(void) /* flag all positions in alignment that have a gap */ { /* in ANY sequence */ sint seqn; sint posn; tree_gaps = (char *)ckalloc( (max_aln_length+1) * sizeof (char) ); for(posn=1; posn<=seqlen_array[first_seq]; ++posn) { tree_gaps[posn] = 0; for(seqn=1; seqn<=last_seq-first_seq+1; ++seqn) { if((seq_array[seqn+first_seq-1][posn] == gap_pos1) || (seq_array[seqn+first_seq-1][posn] == gap_pos2)) { tree_gaps[posn] = 1; break; } } } } void distance_matrix_output(FILE *ofile) { sint i,j; fprintf(ofile,"%6d",(pint)last_seq-first_seq+1); for(i=1;i<=last_seq-first_seq+1;i++) { fprintf(ofile,"\n%-*s ",max_names,names[i]); for(j=1;j<=last_seq-first_seq+1;j++) { fprintf(ofile,"%6.3f ",tmat[i][j]); if(j % 8 == 0) { if(j!=last_seq-first_seq+1) fprintf(ofile,"\n"); if(j != last_seq-first_seq+1 ) fprintf(ofile," "); } } } } #ifdef ORIGINAL_NJ_TREE void nj_tree(char **tree_description, FILE *tree) { register int i; sint l[4],nude,k; sint nc,mini,minj,j,ii,jj; double fnseqs,fnseqs2=0,sumd; double diq,djq,dij,d2r,dr,dio,djo,da; double tmin,total,dmin; double bi,bj,b1,b2,b3,branch[4]; sint typei,typej; /* 0 = node; 1 = OTU */ fnseqs = (double)last_seq-first_seq+1; /*********************** First initialisation ***************************/ if(verbose) { fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n"); fprintf(tree,"\n Saitou, N. and Nei, M. (1987)"); fprintf(tree," The Neighbor-joining Method:"); fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees."); fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n"); fprintf(tree,"\n\n This is an UNROOTED tree\n"); fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n"); } if (fnseqs == 2) { if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]); return; } mini = minj = 0; left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) ); av = (double *) ckalloc( (nseqs+1) * sizeof (double) ); for(i=1;i<=last_seq-first_seq+1;++i) { tmat[i][i] = av[i] = 0.0; tkill[i] = 0; } /*********************** Enter The Main Cycle ***************************/ /* for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { */ /**start main cycle**/ for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { sumd = 0.0; for(j=2; j<=last_seq-first_seq+1; ++j) for(i=1; i 0.0 ) typei = 0; else typei = 1; if( av[minj] > 0.0 ) typej = 0; else typej = 1; if(verbose) fprintf(tree,"\n Cycle%4d = ",(pint)nc); /* set negative branch lengths to zero. Also set any tiny positive branch lengths to zero. */ if( fabs(bi) < 0.0001) bi = 0.0; if( fabs(bj) < 0.0001) bj = 0.0; if(verbose) { if(typei == 0) fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi); else fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi); if(typej == 0) fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj); else fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj); fprintf(tree,"\n"); } left_branch[nc] = bi; right_branch[nc] = bj; for(i=1; i<=last_seq-first_seq+1; i++) tree_description[nc][i] = 0; if(typei == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][mini] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][mini] = 1; if(typej == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][minj] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][minj] = 1; /* Here is where the -0.00005 branch lengths come from for 3 or more identical seqs. */ /* if(dmin <= 0.0) dmin = 0.0001; */ if(dmin <= 0.0) dmin = 0.000001; av[mini] = dmin * 0.5; /*........................Re-initialisation................................*/ fnseqs = fnseqs - 1.0; tkill[minj] = 1; for(j=1; j<=last_seq-first_seq+1; ++j) if( tkill[j] != 1 ) { da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5; if( (mini - j) < 0 ) tmat[mini][j] = da; if( (mini - j) > 0) tmat[j][mini] = da; } for(j=1; j<=last_seq-first_seq+1; ++j) tmat[minj][j] = tmat[j][minj] = 0.0; /****/ } /**end main cycle**/ /******************************Last Cycle (3 Seqs. left)********************/ nude = 1; for(i=1; i<=last_seq-first_seq+1; ++i) if( tkill[i] != 1 ) { l[nude] = i; nude = nude + 1; } b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5; b2 = tmat[l[1]][l[2]] - b1; b3 = tmat[l[1]][l[3]] - b1; branch[1] = b1 - av[l[1]]; branch[2] = b2 - av[l[2]]; branch[3] = b3 - av[l[3]]; /* Reset tiny negative and positive branch lengths to zero */ if( fabs(branch[1]) < 0.0001) branch[1] = 0.0; if( fabs(branch[2]) < 0.0001) branch[2] = 0.0; if( fabs(branch[3]) < 0.0001) branch[3] = 0.0; left_branch[last_seq-first_seq+1-2] = branch[1]; left_branch[last_seq-first_seq+1-1] = branch[2]; left_branch[last_seq-first_seq+1] = branch[3]; for(i=1; i<=last_seq-first_seq+1; i++) tree_description[last_seq-first_seq+1-2][i] = 0; if(verbose) fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc); for(i=1; i<=3; ++i) { if( av[l[i]] > 0.0) { if(verbose) fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]); for(k=last_seq-first_seq+1-3; k>=1; k--) if(tree_description[k][l[i]] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[k][j] == 1) tree_description[last_seq-first_seq+1-2][j] = i; break; } } else { if(verbose) fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]); tree_description[last_seq-first_seq+1-2][l[i]] = i; } if(i < 3) { if(verbose) fprintf(tree,"joins"); } } if(verbose) fprintf(tree,"\n"); } #else /* ORIGINAL_NJ_TREE */ void nj_tree(char **tree_description, FILE *tree) { void fast_nj_tree(); /*fprintf(stderr, "****** call fast_nj_tree() !!!! ******\n");*/ fast_nj_tree(tree_description, tree); } /**************************************************************************** * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited. * written by Tadashi Koike * (takoike@genes.nig.ac.jp) ******************* * : Store the value of sum of the score to temporary array, * and use again and again. * * In the main cycle, these are calculated again and again : * diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1), * djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1), * dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1), * djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1) * // 'last_seq' and 'first_seq' are both constant values // * and the result of above calculations is always same until * a best pair of neighbour nodes is joined. * * So, we change the logic to calculate the sum[i] (=sum of tmat[n][i] * (n:1 to last_seq-first_seq+1)) and store it to array, before * beginning to find a best pair of neighbour nodes, and after that * we use them again and again. * * tmat[i][j] * 1 2 3 4 5 * +---+---+---+---+---+ * 1 | | | | | | * +---+---+---+---+---+ * 2 | | | | | | 1) calculate sum of tmat[n][i] * +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1) * 3 | | | | | | 2) store that sum value to sum[i] * +---+---+---+---+---+ * 4 | | | | | | 3) use sum[i] during finding a best * +---+---+---+---+---+ pair of neibour nodes. * 5 | | | | | | * +---+---+---+---+---+ * | | | | | * V V V V V Calculate sum , and store it to sum[i] * +---+---+---+---+---+ * sum[i] | | | | | | * +---+---+---+---+---+ * * At this time, we thought that we use upper triangle of the matrix * because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead * of sum[i] for storing the sum value. * * tmat[i][j] * 1 2 3 4 5 sum_cols[i] * +---+---+---+---+---+ +---+ * 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5] * + - +---+---+---+---+ +---+ * 2 | # | # | # | --> | | ... sum of tmat[2][3..5] * + - + - +---+---+---+ +---+ * 3 | # | # | --> | | ... sum of tmat[3][4..5] * + - + - + - +---+---+ +---+ * 4 | # | --> | | ... sum of tmat[4][5] * + - + - + - + - +---+ +---+ * 5 | --> | | ... zero * + - + - + - + - + - + +---+ * | | | | | * V V V V V Calculate sum , sotre to sum[i] * +---+---+---+---+---+ * sum_rows[i] | | | | | | * +---+---+---+---+---+ * | | | | | * | | | | +----- sum of tmat[1..4][5] * | | | +--------- sum of tmat[1..3][4] * | | +------------- sum of tmat[1..2][3] * | +----------------- sum of tmat[1][2] * +--------------------- zero * * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i]. * ******************* * : We manage valid nodes with chain list, instead of * tkill[i] flag array. * * In original logic, invalid(killed?) nodes after nodes-joining * are managed with tkill[i] flag array (set to 1 when killed). * By this method, it is conspicuous to try next node but skip it * at the latter of finding a best pair of neighbor nodes. * * So, we thought that we managed valid nodes by using a chain list * as below: * * 1) declare the list structure. * struct { * sint n; // entry number of node. * void *prev; // pointer to previous entry. * void *next; // pointer to next entry. * } * 2) construct a valid node list. * * +-----+ +-----+ +-----+ +-----+ +-----+ * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev | * | 0 | | 1 | | 2 | | 3 | | n | * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL * +-----+ +-----+ +-----+ +-----+ +-----+ * * 3) when finding a best pair of neighbor nodes, we use * this chain list as loop counter. * * 4) If an entry was killed by node-joining, this chain list is * modified to remove that entry. * * EX) remove the entry No 2. * +-----+ +-----+ +-----+ +-----+ * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev | * | 0 | | 1 | | 3 | | n | * | next|--->| next|-------------->| next|- - - ->| next|->NULL * +-----+ +-----+ +-----+ +-----+ * +-----+ * NULL<-|prev | * | 2 | * | next|->NULL * +-----+ * * By this method, speed is up at the latter of finding a best pair of * neighbor nodes. * ******************* * : Cut the frequency of division. * * At comparison between 'total' and 'tmin' in the main cycle, total is * divided by (2.0*fnseqs2) before comparison. If N nodes are available, * that division happen (N*(N-1))/2 order. * * We thought that the comparison relation between tmin and total/(2.0*fnseqs2) * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total. * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead. * ******************* * : some transformation of the equation (to cut operations). * * We transform an equation of calculating 'total' in the main cycle. * */ void fast_nj_tree(char **tree_description, FILE *tree) { register int i; sint l[4],nude,k; sint nc,mini,minj,j,ii,jj; double fnseqs,fnseqs2=0,sumd; double diq,djq,dij,d2r,dr,dio,djo,da; double tmin,total,dmin; double bi,bj,b1,b2,b3,branch[4]; sint typei,typej; /* 0 = node; 1 = OTU */ /* IMPROVEMENT 1, STEP 0 : declare variables */ double *sum_cols, *sum_rows, *join; /* IMPROVEMENT 2, STEP 0 : declare variables */ sint loop_limit; typedef struct _ValidNodeID { sint n; struct _ValidNodeID *prev; struct _ValidNodeID *next; } ValidNodeID; ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next; /* * correspondence of the loop counter variables. * i .. lpi->n, ii .. lpii->n * j .. lpj->n, jj .. lpjj->n */ fnseqs = (double)last_seq-first_seq+1; /*********************** First initialisation ***************************/ if(verbose) { fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n"); fprintf(tree,"\n Saitou, N. and Nei, M. (1987)"); fprintf(tree," The Neighbor-joining Method:"); fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees."); fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n"); fprintf(tree,"\n\n This is an UNROOTED tree\n"); fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n"); } if (fnseqs == 2) { if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]); return; } mini = minj = 0; left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) ); av = (double *) ckalloc( (nseqs+1) * sizeof (double) ); /* IMPROVEMENT 1, STEP 1 : Allocate memory */ sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) ); sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) ); join = (double *) ckalloc( (nseqs+1) * sizeof (double) ); /* IMPROVEMENT 2, STEP 1 : Allocate memory */ tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) ); /* tvalid[0] is special entry in array. it points a header of valid entry list */ tvalid[0].n = 0; tvalid[0].prev = NULL; tvalid[0].next = &tvalid[1]; /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */ for(i=1, loop_limit = last_seq-first_seq+1, lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ; i<=loop_limit ; ++i, ++lpi, ++lp_prev, ++lp_next) { tmat[i][i] = av[i] = 0.0; tkill[i] = 0; lpi->n = i; lpi->prev = lp_prev; lpi->next = lp_next; /* IMPROVEMENT 1, STEP 2 : Initialize arrays */ sum_cols[i] = sum_rows[i] = join[i] = 0.0; } tvalid[loop_limit].next = NULL; /* * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that * is sequence[i] to others. */ sumd = 0.0; for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) { double tmp_sum = 0.0; j = lpj->n; /* calculate sum_rows[j] */ for (lpi=tvalid[0].next ; lpi->n < j ; lpi = lpi->next) { i = lpi->n; tmp_sum += tmat[i][j]; /* tmat[j][i] = tmat[i][j]; */ } sum_rows[j] = tmp_sum; tmp_sum = 0.0; /* Set lpi to that lpi->n is greater than j */ if ((lpi != NULL) && (lpi->n == j)) { lpi = lpi->next; } /* calculate sum_cols[j] */ for( ; lpi!=NULL ; lpi = lpi->next) { i = lpi->n; tmp_sum += tmat[j][i]; /* tmat[i][j] = tmat[j][i]; */ } sum_cols[j] = tmp_sum; } /*********************** Enter The Main Cycle ***************************/ for(nc=1, loop_limit = (last_seq-first_seq+1-3); nc<=loop_limit; ++nc) { sumd = 0.0; /* IMPROVEMENT 1, STEP 4 : use sum value */ for(lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) { sumd += sum_cols[lpj->n]; } /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */ fnseqs2 = fnseqs - 2.0; /* Set fnseqs2 at this point. */ tmin = 99999.0 * 2.0 * fnseqs2; /*.................compute SMATij values and find the smallest one ........*/ mini = minj = 0; /* jj must starts at least 2 */ if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1)) { lpjj = tvalid[0].next->next; } else { lpjj = tvalid[0].next; } for( ; lpjj != NULL; lpjj = lpjj->next) { jj = lpjj->n; for(lpii=tvalid[0].next ; lpii->n < jj ; lpii = lpii->next) { ii = lpii->n; diq = djq = 0.0; /* IMPROVEMENT 1, STEP 4 : use sum value */ diq = sum_cols[ii] + sum_rows[ii]; djq = sum_cols[jj] + sum_rows[jj]; /* * always ii < jj in this point. Use upper * triangle of score matrix. */ dij = tmat[ii][jj]; /* * IMPROVEMENT 3, STEP 1 : fnseqs2 is * already calculated. */ /* fnseqs2 = fnseqs - 2.0 */ /* IMPROVEMENT 4 : transform the equation */ /*-------------------------------------------------------------------* * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0' * * total = d2r + fnseq2*dij + 2.0*dr * * = d2r + fnseq2*dij + 2(sumd - dij - d2r) * * = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r * * = fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r * * = fnseq2*dij + 2*sumd - 2*dij - d2r * * = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij) * * = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij * * = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq * * = fnseq2*dij + 2*sumd - diq - djq * *-------------------------------------------------------------------*/ total = fnseqs2*dij + 2.0*sumd - diq - djq; /* * IMPROVEMENT 3, STEP 2 : abbrevlate * the division on comparison between * total and tmin. */ /* total = total / (2.0*fnseqs2); */ if(total < tmin) { tmin = total; mini = ii; minj = jj; } } } /* MEMO: always ii < jj in avobe loop, so mini < minj */ /*.................compute branch lengths and print the results ........*/ dio = djo = 0.0; /* IMPROVEMENT 1, STEP 4 : use sum value */ dio = sum_cols[mini] + sum_rows[mini]; djo = sum_cols[minj] + sum_rows[minj]; dmin = tmat[mini][minj]; dio = (dio - dmin) / fnseqs2; djo = (djo - dmin) / fnseqs2; bi = (dmin + dio - djo) * 0.5; bj = dmin - bi; bi = bi - av[mini]; bj = bj - av[minj]; if( av[mini] > 0.0 ) typei = 0; else typei = 1; if( av[minj] > 0.0 ) typej = 0; else typej = 1; if(verbose) fprintf(tree,"\n Cycle%4d = ",(pint)nc); /* set negative branch lengths to zero. Also set any tiny positive branch lengths to zero. */ if( fabs(bi) < 0.0001) bi = 0.0; if( fabs(bj) < 0.0001) bj = 0.0; if(verbose) { if(typei == 0) fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi); else fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi); if(typej == 0) fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj); else fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj); fprintf(tree,"\n"); } left_branch[nc] = bi; right_branch[nc] = bj; for(i=1; i<=last_seq-first_seq+1; i++) tree_description[nc][i] = 0; if(typei == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][mini] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][mini] = 1; if(typej == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][minj] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][minj] = 1; /* Here is where the -0.00005 branch lengths come from for 3 or more identical seqs. */ /* if(dmin <= 0.0) dmin = 0.0001; */ if(dmin <= 0.0) dmin = 0.000001; av[mini] = dmin * 0.5; /*........................Re-initialisation................................*/ fnseqs = fnseqs - 1.0; tkill[minj] = 1; /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */ /* [ Before ] * +---------+ +---------+ +---------+ * |prev |<-------|prev |<-------|prev |<--- * | n | | n(=minj)| | n | * | next|------->| next|------->| next|---- * +---------+ +---------+ +---------+ * * [ After ] * +---------+ +---------+ * |prev |<--------------------------|prev |<--- * | n | | n | * | next|-------------------------->| next|---- * +---------+ +---------+ * +---------+ * NULL---|prev | * | n(=minj)| * | next|---NULL * +---------+ */ (tvalid[minj].prev)->next = tvalid[minj].next; if (tvalid[minj].next != NULL) { (tvalid[minj].next)->prev = tvalid[minj].prev; } tvalid[minj].prev = tvalid[minj].next = NULL; /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */ for(lpj=tvalid[0].next ; lpj != NULL ; lpj = lpj->next) { double tmp_di = 0.0; double tmp_dj = 0.0; j = lpj->n; /* * subtrace a score value related with 'minj' from * sum arrays . */ if (j < minj) { tmp_dj = tmat[j][minj]; sum_cols[j] -= tmp_dj; } else if (j > minj) { tmp_dj = tmat[minj][j]; sum_rows[j] -= tmp_dj; } /* nothing to do when j is equal to minj. */ /* * subtrace a score value related with 'mini' from * sum arrays . */ if (j < mini) { tmp_di = tmat[j][mini]; sum_cols[j] -= tmp_di; } else if (j > mini) { tmp_di = tmat[mini][j]; sum_rows[j] -= tmp_di; } /* nothing to do when j is equal to mini. */ /* * calculate a score value of the new inner node. * then, store it temporary to join[] array. */ join[j] = (tmp_dj + tmp_di) * 0.5; } /* * 1) * Set the score values (stored in join[]) into the matrix, * row/column position is 'mini'. * 2) * Add a score value of the new inner node to sum arrays. */ for(lpj=tvalid[0].next ; lpj != NULL; lpj = lpj->next) { j = lpj->n; if (j < mini) { tmat[j][mini] = join[j]; sum_cols[j] += join[j]; } else if (j > mini) { tmat[mini][j] = join[j]; sum_rows[j] += join[j]; } /* nothing to do when j is equal to mini. */ } /* Re-calculate sum_rows[mini],sum_cols[mini]. */ sum_cols[mini] = sum_rows[mini] = 0.0; /* calculate sum_rows[mini] */ da = 0.0; for(lpj=tvalid[0].next ; lpj->n < mini ; lpj = lpj->next) { da += join[lpj->n]; } sum_rows[mini] = da; /* skip if 'lpj->n' is equal to 'mini' */ if ((lpj != NULL) && (lpj->n == mini)) { lpj = lpj->next; } /* calculate sum_cols[mini] */ da = 0.0; for( ; lpj != NULL; lpj = lpj->next) { da += join[lpj->n]; } sum_cols[mini] = da; /* * Clean up sum_rows[minj], sum_cols[minj] and score matrix * related with 'minj'. */ sum_cols[minj] = sum_rows[minj] = 0.0; for(j=1; j<=last_seq-first_seq+1; ++j) tmat[minj][j] = tmat[j][minj] = join[j] = 0.0; /****/ } /**end main cycle**/ /******************************Last Cycle (3 Seqs. left)********************/ nude = 1; for(lpi=tvalid[0].next; lpi != NULL; lpi = lpi->next) { l[nude] = lpi->n; ++nude; } b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5; b2 = tmat[l[1]][l[2]] - b1; b3 = tmat[l[1]][l[3]] - b1; branch[1] = b1 - av[l[1]]; branch[2] = b2 - av[l[2]]; branch[3] = b3 - av[l[3]]; /* Reset tiny negative and positive branch lengths to zero */ if( fabs(branch[1]) < 0.0001) branch[1] = 0.0; if( fabs(branch[2]) < 0.0001) branch[2] = 0.0; if( fabs(branch[3]) < 0.0001) branch[3] = 0.0; left_branch[last_seq-first_seq+1-2] = branch[1]; left_branch[last_seq-first_seq+1-1] = branch[2]; left_branch[last_seq-first_seq+1] = branch[3]; for(i=1; i<=last_seq-first_seq+1; i++) tree_description[last_seq-first_seq+1-2][i] = 0; if(verbose) fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc); for(i=1; i<=3; ++i) { if( av[l[i]] > 0.0) { if(verbose) fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]); for(k=last_seq-first_seq+1-3; k>=1; k--) if(tree_description[k][l[i]] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[k][j] == 1) tree_description[last_seq-first_seq+1-2][j] = i; break; } } else { if(verbose) fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]); tree_description[last_seq-first_seq+1-2][l[i]] = i; } if(i < 3) { if(verbose) fprintf(tree,"joins"); } } if(verbose) fprintf(tree,"\n"); /* IMPROVEMENT 1, STEP 6 : release memory area */ ckfree(sum_cols); ckfree(sum_rows); ckfree(join); /* IMPROVEMENT 2, STEP 4 : release memory area */ ckfree(tvalid); } #endif /* ORIGINAL_NJ_TREE */ void bootstrap_tree(char *phylip_name,char *clustal_name, char *nexus_name) { sint i,j; int ranno; char path[MAXLINE+1]; char dummy[10]; char err_mess[1024]; static char **sample_tree; static char **standard_tree; static char **save_tree; sint total_dists, overspill = 0, total_overspill = 0; sint nfails = 0; if(empty) { error("You must load an alignment first"); return; } if(nseqs<4) { error("Alignment has only %d sequences",nseqs); return; } if(!output_tree_clustal && !output_tree_phylip && !output_tree_nexus) { error("You must select either clustal or phylip or nexus tree output format"); return; } get_path(seqname, path); if (output_tree_clustal) { if (clustal_name[0]!=EOS) { if((clustal_phy_tree_file = open_explicit_file( clustal_name))==NULL) return; } else { if((clustal_phy_tree_file = open_output_file( "\nEnter name for bootstrap output file ",path, clustal_name,"njb")) == NULL) return; } } first_seq=1; last_seq=nseqs; if (output_tree_phylip) { if (phylip_name[0]!=EOS) { if((phylip_phy_tree_file = open_explicit_file( phylip_name))==NULL) return; } else { if((phylip_phy_tree_file = open_output_file( "\nEnter name for bootstrap output file ",path, phylip_name,"phb")) == NULL) return; } } if (output_tree_nexus) { if (nexus_name[0]!=EOS) { if((nexus_phy_tree_file = open_explicit_file( nexus_name))==NULL) return; } else { if((nexus_phy_tree_file = open_output_file( "\nEnter name for bootstrap output file ",path, nexus_name,"treb")) == NULL) return; } } boot_totals = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); for(i=0;i 0 ) { total_dists = (nseqs*(nseqs-1))/2; overspill_message(overspill,total_dists); } tree_gaps=ckfree((void *)tree_gaps); if (output_tree_clustal) verbose = TRUE; /* Turn on screen output */ standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) ); for(i=0; i 0) { total_overspill = total_overspill + overspill; nfails++; } tree_gaps=ckfree((void *)tree_gaps); if(output_tree_clustal || output_tree_phylip || output_tree_nexus) nj_tree(sample_tree,clustal_phy_tree_file); left_branch=ckfree((void *)left_branch); right_branch=ckfree((void *)right_branch); tkill=ckfree((void *)tkill); av=ckfree((void *)av); compare_tree(standard_tree, sample_tree, boot_totals, last_seq-first_seq+1); if (usemenu) { if(i % 10 == 0) fprintf(stdout,"."); if(i % 100 == 0) fprintf(stdout,"\n"); } } /* check if any distances overflowed the distance corrections */ if ( nfails > 0 ) { total_dists = (nseqs*(nseqs-1))/2; fprintf(stdout,"\n"); fprintf(stdout,"\n WARNING: %ld of the distances out of a total of %ld times %ld", (long)total_overspill,(long)total_dists,(long)boot_ntrials); fprintf(stdout,"\n were out of range for the distance correction."); fprintf(stdout,"\n This affected %d out of %d bootstrap trials.", (pint)nfails,(pint)boot_ntrials); fprintf(stdout,"\n This may not be fatal but you have been warned!"); fprintf(stdout,"\n"); fprintf(stdout,"\n SUGGESTIONS: 1) turn off the correction"); fprintf(stdout,"\n or 2) remove the most distant sequences"); fprintf(stdout,"\n or 3) use the PHYLIP package."); fprintf(stdout,"\n\n"); if (usemenu) getstr("Press [RETURN] to continue",dummy); } boot_positions=ckfree((void *)boot_positions); for (i=1;i0) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,","); old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,2,bootstrap); fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-1]); if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,","); old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,3,bootstrap); fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1]); if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,")"); if (bootstrap==BS_NODE_LABELS) fprintf(tree,"TRICHOTOMY"); fprintf(tree,";"); } fprintf(tree,"\nENDBLOCK;\n"); } sint two_way_split_nexus (char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap) { sint row, new_row = 0, old_row, col, test_col = 0; Boolean single_seq; if(start_row != last_seq-first_seq+1-2) fprintf(tree,"("); for(col=1; col<=last_seq-first_seq+1; col++) { if(tree_description[start_row][col] == flag) { test_col = col; break; } } single_seq = TRUE; for(row=start_row-1; row>=1; row--) if(tree_description[row][test_col] == 1) { single_seq = FALSE; new_row = row; break; } if(single_seq) { tree_description[start_row][test_col] = 0; fprintf(tree,"%d",test_col+first_seq-1); if(start_row == last_seq-first_seq+1-2) { return(0); } fprintf(tree,":%7.5f,",left_branch[start_row]); } else { for(col=1; col<=last_seq-first_seq+1; col++) { if((tree_description[start_row][col]==1)&& (tree_description[new_row][col]==1)) tree_description[start_row][col] = 0; } old_row=two_way_split_nexus(tree_description, tree, new_row, (sint)1, bootstrap); if(start_row == last_seq-first_seq+1-2) { return(new_row); } fprintf(tree,":%7.5f",left_branch[start_row]); if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,","); } for(col=1; col<=last_seq-first_seq+1; col++) if(tree_description[start_row][col] == flag) { test_col = col; break; } single_seq = TRUE; new_row = 0; for(row=start_row-1; row>=1; row--) if(tree_description[row][test_col] == 1) { single_seq = FALSE; new_row = row; break; } if(single_seq) { tree_description[start_row][test_col] = 0; fprintf(tree,"%d",test_col+first_seq-1); fprintf(tree,":%7.5f)",right_branch[start_row]); } else { for(col=1; col<=last_seq-first_seq+1; col++) { if((tree_description[start_row][col]==1)&& (tree_description[new_row][col]==1)) tree_description[start_row][col] = 0; } old_row=two_way_split_nexus(tree_description, tree, new_row, (sint)1, bootstrap); fprintf(tree,":%7.5f",right_branch[start_row]); if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,")"); } if ((bootstrap==BS_NODE_LABELS) && (boot_totals[start_row]>0)) fprintf(tree,"%d",(pint)boot_totals[start_row]); return(start_row); } void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap) { sint old_row; if(last_seq-first_seq+1==2) { fprintf(tree,"(%s:%7.5f,%s:%7.5f);",names[first_seq],tmat[first_seq][first_seq+1],names[first_seq+1],tmat[first_seq][first_seq+1]); return; } fprintf(tree,"(\n"); old_row=two_way_split(tree_description, tree, last_seq-first_seq+1-2,1,bootstrap); fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-2]); if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,",\n"); old_row=two_way_split(tree_description, tree, last_seq-first_seq+1-2,2,bootstrap); fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-1]); if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,",\n"); old_row=two_way_split(tree_description, tree, last_seq-first_seq+1-2,3,bootstrap); fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1]); if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,")"); if (bootstrap==BS_NODE_LABELS) fprintf(tree,"TRICHOTOMY"); fprintf(tree,";\n"); } sint two_way_split (char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap) { sint row, new_row = 0, old_row, col, test_col = 0; Boolean single_seq; if(start_row != last_seq-first_seq+1-2) fprintf(tree,"(\n"); for(col=1; col<=last_seq-first_seq+1; col++) { if(tree_description[start_row][col] == flag) { test_col = col; break; } } single_seq = TRUE; for(row=start_row-1; row>=1; row--) if(tree_description[row][test_col] == 1) { single_seq = FALSE; new_row = row; break; } if(single_seq) { tree_description[start_row][test_col] = 0; fprintf(tree,"%.*s",max_names,names[test_col+first_seq-1]); if(start_row == last_seq-first_seq+1-2) { return(0); } fprintf(tree,":%7.5f,\n",left_branch[start_row]); } else { for(col=1; col<=last_seq-first_seq+1; col++) { if((tree_description[start_row][col]==1)&& (tree_description[new_row][col]==1)) tree_description[start_row][col] = 0; } old_row=two_way_split(tree_description, tree, new_row, (sint)1, bootstrap); if(start_row == last_seq-first_seq+1-2) { return(new_row); } fprintf(tree,":%7.5f",left_branch[start_row]); if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,",\n"); } for(col=1; col<=last_seq-first_seq+1; col++) if(tree_description[start_row][col] == flag) { test_col = col; break; } single_seq = TRUE; new_row = 0; for(row=start_row-1; row>=1; row--) if(tree_description[row][test_col] == 1) { single_seq = FALSE; new_row = row; break; } if(single_seq) { tree_description[start_row][test_col] = 0; fprintf(tree,"%.*s",max_names,names[test_col+first_seq-1]); fprintf(tree,":%7.5f)\n",right_branch[start_row]); } else { for(col=1; col<=last_seq-first_seq+1; col++) { if((tree_description[start_row][col]==1)&& (tree_description[new_row][col]==1)) tree_description[start_row][col] = 0; } old_row=two_way_split(tree_description, tree, new_row, (sint)1, bootstrap); fprintf(tree,":%7.5f",right_branch[start_row]); if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0)) fprintf(tree,"[%d]",(pint)boot_totals[old_row]); fprintf(tree,")\n"); } if ((bootstrap==BS_NODE_LABELS) && (boot_totals[start_row]>0)) fprintf(tree,"%d",(pint)boot_totals[start_row]); return(start_row); } void print_tree(char **tree_description, FILE *tree, sint *totals) { sint row,col; fprintf(tree,"\n"); for(row=1; row<=last_seq-first_seq+1-3; row++) { fprintf(tree," \n"); for(col=1; col<=last_seq-first_seq+1; col++) { if(tree_description[row][col] == 0) fprintf(tree,"*"); else fprintf(tree,"."); } if(totals[row] > 0) fprintf(tree,"%7d",(pint)totals[row]); } fprintf(tree," \n"); for(col=1; col<=last_seq-first_seq+1; col++) fprintf(tree,"%1d",(pint)tree_description[last_seq-first_seq+1-2][col]); fprintf(tree,"\n"); } sint dna_distance_matrix(FILE *tree) { sint m,n; sint j,i; sint res1, res2; sint overspill = 0; double p,q,e,a,b,k; tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */ if(verbose) { fprintf(tree,"\n"); fprintf(tree,"\n DIST = percentage divergence (/100)"); fprintf(tree,"\n p = rate of transition (A <-> G; C <-> T)"); fprintf(tree,"\n q = rate of transversion"); fprintf(tree,"\n Length = number of sites used in comparison"); fprintf(tree,"\n"); if(tossgaps) { fprintf(tree,"\n All sites with gaps (in any sequence) deleted!"); fprintf(tree,"\n"); } if(kimura) { fprintf(tree,"\n Distances corrected by Kimura's 2 parameter model:"); fprintf(tree,"\n\n Kimura, M. (1980)"); fprintf(tree," A simple method for estimating evolutionary "); fprintf(tree,"rates of base"); fprintf(tree,"\n substitutions through comparative studies of "); fprintf(tree,"nucleotide sequences."); fprintf(tree,"\n J. Mol. Evol., 16, 111-120."); fprintf(tree,"\n\n"); } } for(m=1; m 0) ) goto skip; /* gap position */ res1 = seq_array[m+first_seq-1][j]; res2 = seq_array[n+first_seq-1][j]; if( (res1 == gap_pos1) || (res1 == gap_pos2) || (res2 == gap_pos1) || (res2 == gap_pos2)) goto skip; /* gap in a seq*/ if(!use_ambiguities) if( is_ambiguity(res1) || is_ambiguity(res2)) goto skip; /* ambiguity code in a seq*/ e = e + 1.0; if(res1 != res2) { if(transition(res1,res2)) p = p + 1.0; else q = q + 1.0; } skip:; } /* Kimura's 2 parameter correction for multiple substitutions */ if(!kimura) { if (e == 0) { fprintf(stdout,"\n WARNING: sequences %d and %d are non-overlapping\n",m,n); k = 0.0; p = 0.0; q = 0.0; } else { k = (p+q)/e; if(p > 0.0) p = p/e; else p = 0.0; if(q > 0.0) q = q/e; else q = 0.0; } tmat[m][n] = tmat[n][m] = k; if(verbose) /* if screen output */ fprintf(tree, "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n" ,(pint)m,(pint)n,k,p,q,e); } else { if (e == 0) { fprintf(stdout,"\n WARNING: sequences %d and %d are non-overlapping\n",m,n); p = 0.0; q = 0.0; } else { if(p > 0.0) p = p/e; else p = 0.0; if(q > 0.0) q = q/e; else q = 0.0; } if( ((2.0*p)+q) == 1.0 ) a = 0.0; else a = 1.0/(1.0-(2.0*p)-q); if( q == 0.5 ) b = 0.0; else b = 1.0/(1.0-(2.0*q)); /* watch for values going off the scale for the correction. */ if( (a<=0.0) || (b<=0.0) ) { overspill++; k = 3.5; /* arbitrary high score */ } else k = 0.5*log(a) + 0.25*log(b); tmat[m][n] = tmat[n][m] = k; if(verbose) /* if screen output */ fprintf(tree, "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n" ,(pint)m,(pint)n,k,p,q,e); } } return overspill; /* return the number of off-scale values */ } sint prot_distance_matrix(FILE *tree) { sint m,n; sint j,i; sint res1, res2; sint overspill = 0; double p,e,k, table_entry; tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */ if(verbose) { fprintf(tree,"\n"); fprintf(tree,"\n DIST = percentage divergence (/100)"); fprintf(tree,"\n Length = number of sites used in comparison"); fprintf(tree,"\n\n"); if(tossgaps) { fprintf(tree,"\n All sites with gaps (in any sequence) deleted"); fprintf(tree,"\n"); } if(kimura) { fprintf(tree,"\n Distances up tp 0.75 corrected by Kimura's empirical method:"); fprintf(tree,"\n\n Kimura, M. (1983)"); fprintf(tree," The Neutral Theory of Molecular Evolution."); fprintf(tree,"\n Page 75. Cambridge University Press, Cambridge, England."); fprintf(tree,"\n\n"); } } for(m=1; m 0) ) goto skip; /* gap position */ res1 = seq_array[m][j]; res2 = seq_array[n][j]; if( (res1 == gap_pos1) || (res1 == gap_pos2) || (res2 == gap_pos1) || (res2 == gap_pos2)) goto skip; /* gap in a seq*/ e = e + 1.0; if(res1 != res2) p = p + 1.0; skip:; } if(p <= 0.0) k = 0.0; else k = p/e; /* DES debug */ /* fprintf(stdout,"Seq1=%4d Seq2=%4d k =%7.4f \n",(pint)m,(pint)n,k); */ /* DES debug */ if(kimura) { if(k < 0.75) { /* use Kimura's formula */ if(k > 0.0) k = - log(1.0 - k - (k * k/5.0) ); } else { if(k > 0.930) { overspill++; k = 10.0; /* arbitrarily set to 1000% */ } else { table_entry = (k*1000.0) - 750.0; k = (double)dayhoff_pams[(int)table_entry]; k = k/100.0; } } } tmat[m][n] = tmat[n][m] = k; if(verbose) /* if screen output */ fprintf(tree, "%4d vs.%4d DIST = %6.4f; length = %6.0f\n", (pint)m,(pint)n,k,e); } return overspill; } void guide_tree(FILE *tree,sint firstseq,sint numseqs) /* Routine for producing unrooted NJ trees from seperately aligned pairwise distances. This produces the GUIDE DENDROGRAMS in PHYLIP format. */ { static char **standard_tree; sint i; float dist; phylip_phy_tree_file=tree; verbose = FALSE; first_seq=firstseq; last_seq=first_seq+numseqs-1; if(numseqs==2) { dist=tmat[firstseq][firstseq+1]/2.0; fprintf(tree,"(%s:%0.5f,%s:%0.5f);\n", names[firstseq],dist,names[firstseq+1],dist); } else { standard_tree = (char **) ckalloc( (last_seq-first_seq+2) * sizeof (char *) ); for(i=0; i