/********* Sequence input routines for CLUSTAL W *******************/ /* DES was here. FEB. 1994 */ /* Now reads PILEUP/MSF and CLUSTAL alignment files */ #include #include #include #include #include "clustalw.h" #define MIN(a,b) ((a)<(b)?(a):(b)) /* * Prototypes */ static char * get_seq(char *sname,sint *len,char *tit,Boolean dnaflag); static char * get_clustal_seq(char *sname,sint *len,sint seqno); static char * get_msf_seq(char *sname,sint *len,sint seqno); static void check_infile(sint *noseqs,Boolean *dnaflag); static void p_encode(char *seq, char *naseq, sint l); static void n_encode(char *seq, char *naseq, sint l); static sint res_index(char *codes,char c); static Boolean check_dnaflag(char *seq, sint len); static sint count_clustal_seqs(void); static sint count_pir_seqs(void); static sint count_msf_seqs(void); static sint count_rsf_seqs(void); static void get_swiss_feature(char *line,sint len,SEC_STRUCT_DATAPTR ss); static void get_rsf_feature(char *line,sint len,SEC_STRUCT_DATAPTR ss); static void get_swiss_mask(char *line,sint len,SEC_STRUCT_DATAPTR ss); static void get_clustal_ss(SEC_STRUCT_DATAPTR ss, sint length); static void get_embl_ss(SEC_STRUCT_DATAPTR ss, sint length); static void get_rsf_ss(SEC_STRUCT_DATAPTR ss, sint length); static void get_gde_ss(SEC_STRUCT_DATAPTR ss, sint length); static Boolean cl_blankline(char *line); static free_ss(SEC_STRUCT_DATAPTR ss); /* * Global variables */ FILE *fin; static sint debug; static sint seqFormat; static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR", "Pearson","GDE","Clustal","Pileup/MSF","RSF","USER"}; static char * get_msf_seq(char *sname,sint *len,sint seqno) /* read the seqno_th. sequence from a PILEUP multiple alignment file */ { static char line[MAXLINE+1]; char *seq = NULL; sint i,j,k; unsigned char c; fseek(fin,0,0); /* start at the beginning */ *len=0; /* initialise length to zero */ for(i=0;;i++) { if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title*/ if(linetype(line,"//") ) break; /* lines...ignore*/ } while (fgets(line,MAXLINE+1,fin) != NULL) { if(!blankline(line)) { for(i=0;istruct_penalties = SECST; struct_index = ix; for (i=0;isec_struct_mask[i] = '.'; ss->gap_penalty_mask[i] = '.'; } strcpy(ss->name,sname); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) ss->sec_struct_mask[len++] = c; } } } else if(strncmp(line,"!GM",3) == 0) { ix++; sscanf(line+4,"%s%s",sname,tseq); for(j=0;jstruct_penalties = GMASK; struct_index = ix; for (i=0;igap_penalty_mask[i] = '1'; strcpy(ss->name,sname); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) ss->gap_penalty_mask[len++] = c; } } } if (ss->struct_penalties != NONE) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } if (ss->struct_penalties == NONE) return; /* skip any more comment lines */ while (line[0] == '!') { if(fgets(line,MAXLINE+1,fin)==NULL) return; } /* skip the sequence lines and any comments after the alignment */ for (;;) { if(isspace(line[0])) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } /* read the rest of the alignment */ for (;;) { /* skip any blank lines */ for (;;) { if(!blankline(line)) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } /* get structure table line */ for(ix=0;ixstruct_penalties == SECST) error("bad secondary structure format"); else error("bad gap penalty mask format"); ss->struct_penalties = NONE; return; } if(fgets(line,MAXLINE+1,fin)==NULL) return; } if(ss->struct_penalties == SECST) { if (strncmp(line,"!SS",3) != 0) { error("bad secondary structure format"); ss->struct_penalties = NONE; return; } sscanf(line+4,"%s%s",sname,tseq); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) ss->sec_struct_mask[len++] = c; } } else if (ss->struct_penalties == GMASK) { if (strncmp(line,"!GM",3) != 0) { error("bad gap penalty mask format"); ss->struct_penalties = NONE; return; } sscanf(line+4,"%s%s",sname,tseq); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) ss->gap_penalty_mask[len++] = c; } } /* skip any more comment lines */ while (line[0] == '!') { if(fgets(line,MAXLINE+1,fin)==NULL) return; } /* skip the sequence lines */ for (;;) { if(isspace(line[0])) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } } } static void get_embl_ss(SEC_STRUCT_DATAPTR ss, sint length) { static char title[MAXLINE+1]; static char line[MAXLINE+1]; static char lin2[MAXLINE+1]; static char sname[MAXNAMES+1]; char feature[MAXLINE+1]; sint i; /* find the start of the sequence entry */ for (;;) { while( !linetype(line,"ID") ) if (fgets(line,MAXLINE+1,fin) == NULL) return; for(i=5;i<=strlen(line);i++) /* DES */ if(line[i] != ' ') break; strncpy(sname,line+i,MAXNAMES); /* remember entryname */ for(i=0;i<=strlen(sname);i++) if(sname[i] == ' ') { sname[i]=EOS; break; } sname[MAXNAMES]=EOS; rtrim(sname); blank_to_(sname); /* look for secondary structure feature table / gap penalty mask */ while(fgets(line,MAXLINE+1,fin) != NULL) { if (linetype(line,"FT")) { sscanf(line+2,"%s",feature); if (strcmp(feature,"HELIX") == 0 || strcmp(feature,"STRAND") == 0) { strcpy(title,"Found secondary structure in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); if ((*lin2 != 'n') && (*lin2 != 'N')) { ss->struct_penalties = SECST; for (i=0;isec_struct_mask[i] = '.'; do { get_swiss_feature(&line[2],length,ss); fgets(line,MAXLINE+1,fin); } while( linetype(line,"FT") ); } else { do { fgets(line,MAXLINE+1,fin); } while( linetype(line,"FT") ); } strcpy(ss->name,sname); } } else if (linetype(line,"GM")) { strcpy(title,"Found gap penalty mask in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); if ((*lin2 != 'n') && (*lin2 != 'N')) { ss->struct_penalties = GMASK; for (i=0;igap_penalty_mask[i] = '1'; do { get_swiss_mask(&line[2],length,ss); fgets(line,MAXLINE+1,fin); } while( linetype(line,"GM") ); } else { do { fgets(line,MAXLINE+1,fin); } while( linetype(line,"GM") ); } strcpy(ss->name,sname); } if (linetype(line,"SQ")) break; if (ss->struct_penalties != NONE) break; } } } static void get_rsf_ss(SEC_STRUCT_DATAPTR ss, sint length) { static char title[MAXLINE+1]; static char line[MAXLINE+1]; static char lin2[MAXLINE+1]; static char sname[MAXNAMES+1]; sint i; /* skip the comments */ while (fgets(line,MAXLINE+1,fin) != NULL) { if(line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.') break; } /* find the start of the sequence entry */ for (;;) { while (fgets(line,MAXLINE+1,fin) != NULL) if( *line == '{' ) break; while( !keyword(line,"name") ) if (fgets(line,MAXLINE+1,fin) == NULL) return; for(i=5;i<=strlen(line);i++) /* DES */ if(line[i] != ' ') break; strncpy(sname,line+i,MAXNAMES); /* remember entryname */ for(i=0;i<=strlen(sname);i++) if(sname[i] == ' ') { sname[i]=EOS; break; } sname[MAXNAMES]=EOS; rtrim(sname); blank_to_(sname); /* look for secondary structure feature table / gap penalty mask */ while(fgets(line,MAXLINE+1,fin) != NULL) { if (keyword(line,"feature")) { strcpy(title,"Found secondary structure in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); if ((*lin2 != 'n') && (*lin2 != 'N')) { ss->struct_penalties = SECST; for (i=0;isec_struct_mask[i] = '.'; do { if(keyword(line,"feature")) get_rsf_feature(&line[7],length,ss); fgets(line,MAXLINE+1,fin); } while( !keyword(line,"sequence") ); } else { do { fgets(line,MAXLINE+1,fin); } while( !keyword(line,"sequence") ); } strcpy(ss->name,sname); } else if (keyword(line,"sequence")) break; if (ss->struct_penalties != NONE) break; } } } static void get_gde_ss(SEC_STRUCT_DATAPTR ss, sint length) { static char title[MAXLINE+1]; static char line[MAXLINE+1]; static char lin2[MAXLINE+1]; static char sname[MAXNAMES+1]; sint i, len, offset = 0; unsigned char c; for (;;) { line[0] = '\0'; /* search for the next comment line */ while(*line != '"') if (fgets(line,MAXLINE+1,fin) == NULL) return; /* is it a secondary structure entry? */ if (strncmp(&line[1],"SS_",3) == 0) { for (i=1;i<=MAXNAMES-3;i++) { if (line[i+3] == '(' || line[i+3] == '\n') break; sname[i-1] = line[i+3]; } i--; sname[i]=EOS; if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset); else offset = 0; for(i--;i > 0;i--) if(isspace(sname[i])) { sname[i]=EOS; } else break; blank_to_(sname); strcpy(title,"Found secondary structure in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); if ((*lin2 != 'n') && (*lin2 != 'N')) { ss->struct_penalties = SECST; for (i=0;isec_struct_mask[i] = '.'; len = 0; while(fgets(line,MAXLINE+1,fin)) { if(*line == '%' || *line == '#' || *line == '"') break; for(i=offset;i < length;i++) { c=line[i]; if(c == '\n' || c == EOS) break; /* EOL */ ss->sec_struct_mask[len++]=c; } if (len > length) break; } strcpy(ss->name,sname); } } /* or is it a gap penalty mask entry? */ else if (strncmp(&line[1],"GM_",3) == 0) { for (i=1;i<=MAXNAMES-3;i++) { if (line[i+3] == '(' || line[i+3] == '\n') break; sname[i-1] = line[i+3]; } i--; sname[i]=EOS; if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset); else offset = 0; for(i--;i > 0;i--) if(isspace(sname[i])) { sname[i]=EOS; } else break; blank_to_(sname); strcpy(title,"Found gap penalty mask in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); if ((*lin2 != 'n') && (*lin2 != 'N')) { ss->struct_penalties = GMASK; for (i=0;igap_penalty_mask[i] = '1'; len = 0; while(fgets(line,MAXLINE+1,fin)) { if(*line == '%' || *line == '#' || *line == '"') break; for(i=offset;i < length;i++) { c=line[i]; if(c == '\n' || c == EOS) break; /* EOL */ ss->gap_penalty_mask[len++]=c; } if (len > length) break; } strcpy(ss->name,sname); } } if (ss->struct_penalties != NONE) break; } } static void get_swiss_feature(char *line,sint len,SEC_STRUCT_DATAPTR ss) { char c, s, feature[MAXLINE+1]; int i, start_pos, end_pos; if (sscanf(line,"%s%d%d",feature,&start_pos,&end_pos) != 3) { return; } if (strcmp(feature,"HELIX") == 0) { c = 'A'; s = '$'; } else if (strcmp(feature,"STRAND") == 0) { c = 'B'; s = '%'; } else return; if(start_pos >= len || end_pos >= len) return; ss->sec_struct_mask[start_pos-1] = s; for (i=start_pos;isec_struct_mask[i] = c; ss->sec_struct_mask[end_pos-1] = s; } static void get_rsf_feature(char *line,sint len,SEC_STRUCT_DATAPTR ss) { char c, s; char str1[MAXLINE+1],str2[MAXLINE+1],feature[MAXLINE+1]; int i, tmp,start_pos, end_pos; if (sscanf(line,"%d%d%d%s%s%s",&start_pos,&end_pos,&tmp,str1,str2,feature) != 6) { return; } if (strcmp(feature,"HELIX") == 0) { c = 'A'; s = '$'; } else if (strcmp(feature,"STRAND") == 0) { c = 'B'; s = '%'; } else return; if(start_pos >= len || end_pos >= len) return; ss->sec_struct_mask[start_pos-1] = s; for (i=start_pos;isec_struct_mask[i] = c; ss->sec_struct_mask[end_pos-1] = s; } static void get_swiss_mask(char *line,sint len,SEC_STRUCT_DATAPTR ss) { int i, value, start_pos, end_pos; if (sscanf(line,"%d%d%d",&value,&start_pos,&end_pos) != 3) { return; } if (value < 1 || value > 9) return; if(start_pos >= len || end_pos >= len) return; for (i=start_pos-1;igap_penalty_mask[i] = value+'0'; } static char * get_seq(char *sname,sint *len,char *tit,Boolean dnaflag) { static char line[MAXLINE+1]; char *seq = NULL; sint i, offset = 0; unsigned char c=EOS; Boolean got_seq=FALSE; switch(seqFormat) { /************************************/ case EMBLSWISS: while( !linetype(line,"ID") ) if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; for(i=5;i<=strlen(line);i++) /* DES */ if(line[i] != ' ') break; strncpy(sname,line+i,MAXNAMES); /* remember entryname */ for(i=0;i<=strlen(sname);i++) if(sname[i] == ' ') { sname[i]=EOS; break; } sname[MAXNAMES]=EOS; rtrim(sname); blank_to_(sname); while( !linetype(line,"SQ") ) fgets(line,MAXLINE+1,fin); *len=0; while(fgets(line,MAXLINE+1,fin)) { if(got_seq && blankline(line)) break; if( strlen(line) > 2 && line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.' ) continue; if(seq==NULL) seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); else seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); for(i=0;i') break; /* EOL */ if(isalpha(c)||c==GAP2) seq[(*len)++]=c; } if(c == '>') break; } break; /**********************************************/ case GDE: if (dnaflag) { while(*line != '#') fgets(line,MAXLINE+1,fin); } else { while(*line != '%') fgets(line,MAXLINE+1,fin); } for (i=1;i<=MAXNAMES;i++) { if (line[i] == '(' || line[i] == '\n') break; sname[i-1] = line[i]; } i--; sname[i]=EOS; if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset); else offset = 0; for(i--;i > 0;i--) if(isspace(sname[i])) { sname[i]=EOS; } else break; blank_to_(sname); *tit=EOS; *len=0; for (i=0;i file not found */ } no_seqs=0; check_infile(&no_seqs,&mult_aln->dnaflag); info("Sequence format is %s",formatNames[seqFormat]); /* DES DEBUG fprintf(stdout,"\n\n File name = %s\n\n",filename); */ if(no_seqs == 0) return 0; /* return the number of seqs. (zero here)*/ /* if this is a multiple alignment, or profile 1 - free any memory used by previous alignments, then allocate memory for the new alignment */ if(first_seq == 0) { free_aln(mult_aln); alloc_aln(no_seqs,mult_aln); } /* otherwise, this is a profile 2, and we need to reallocate the arrays, leaving the data for profile 1 intact */ else { realloc_aln(first_seq,no_seqs,mult_aln); } for(i=first_seq;iseqs[i].output_index=i; if(seqFormat == CLUSTAL) seq1=get_clustal_seq(sname1,&l1,i-first_seq); else if(seqFormat == MSF) seq1=get_msf_seq(sname1,&l1,i-first_seq); else seq1=get_seq(sname1,&l1,title,mult_aln->dnaflag); if(seq1==NULL) break; /* JULIE */ /* Set max length of dynamically allocated arrays in prfalign.c */ if (l1 > max_aln_length) max_aln_length = l1; mult_aln->seqs[i].len=l1; /* store the length */ strcpy(mult_aln->seqs[i].name,sname1); /* " " name */ strcpy(mult_aln->seqs[i].title,title); /* " " title */ if(explicit_type==0) { dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */ if(i == 1) mult_aln->dnaflag = dnaflag1; } /* type decided by first seq*/ else if(explicit_type==1) dnaflag1 = TRUE; else dnaflag1 = FALSE; alloc_seq(&mult_aln->seqs[i],l1); for(j=0;jseqs[i].data[j]=tolower(seq1[j]); else mult_aln->seqs[i].data[j]=GAP2; } if(seq1!=NULL) seq1=ckfree(seq1); } max_aln_length *= 2; /* JULIE check sequence names are all different - otherwise phylip tree is confused. */ for(i=0;iseqs[i].name,mult_aln->seqs[j].name,MAXNAMES) == 0) { error("Multiple sequences found with same name, %s (first %d chars are significant)", mult_aln->seqs[i].name,MAXNAMES); return -1; } } } for(i=first_seq;iseqs[i].len>max_aln_length) max_aln_length=mult_aln->seqs[i].len; } /* look for a feature table / gap penalty mask (only if this is a profile) */ if (prf_no > 0) { rewind(fin); if(prf_no==1) ss=(&mult_aln->prf1.ss); else ss=(&mult_aln->prf2.ss); free_ss(ss); ss->struct_penalties = NONE; ss->gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); ss->sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); ss->name = (char *)ckalloc((MAXNAMES+1) * sizeof (char)); if (seqFormat == CLUSTAL) { get_clustal_ss(ss,max_aln_length); } else if (seqFormat == GDE) { get_gde_ss(ss,max_aln_length); } else if (seqFormat == EMBLSWISS) { get_embl_ss(ss,max_aln_length); } else if (seqFormat == RSF) { get_rsf_ss(ss,max_aln_length); } if(ss->struct_penalties == NONE) ss->gap_penalty_mask=(char *)ckfree((void *)ss->gap_penalty_mask); } fclose(fin); return no_seqs; /* return the number of seqs. read in this call */ } static free_ss(SEC_STRUCT_DATAPTR ss) { ss->struct_penalties = NONE; if (ss->sec_struct_mask != NULL) ss->sec_struct_mask=ckfree(ss->sec_struct_mask); if (ss->gap_penalty_mask != NULL) ss->gap_penalty_mask=ckfree(ss->gap_penalty_mask); if (ss->name != NULL) ss->name=ckfree(ss->name); } static Boolean check_dnaflag(char *seq, sint slen) /* check if DNA or Protein The decision is based on counting all A,C,G,T,U or N. If >= 85% of all characters (except -) are as above => DNA */ { sint i, c, nresidues, nbases; float ratio; char *dna_codes="ACGTUN"; nresidues = nbases = 0; for(i=0; i < slen; i++) { if(seq[i] != GAP2) { nresidues++; if(seq[i] == 'N') nbases++; else { c = res_index(dna_codes, seq[i]); if(c >= 0) nbases++; } } } if( (nbases == 0) || (nresidues == 0) ) return FALSE; ratio = (float)nbases/(float)nresidues; /* DES fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n", (pint)nbases,(pint)nresidues,(pint)ratio); */ if(ratio >= 0.85) return TRUE; else return FALSE; } static void check_infile(sint *nseqs,Boolean *dnaflag) { char line[MAXLINE+1]; sint i; *nseqs=0; while (fgets(line,MAXLINE+1,fin) != NULL) { if(!blankline(line)) break; } for(i=strlen(line)-1;i>=0;i--) if(isgraph(line[i])) break; line[i+1]=EOS; for(i=0;i<=6;i++) line[i] = toupper(line[i]); if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */ seqFormat=EMBLSWISS; (*nseqs)++; } else if( linetype(line,"CLUSTAL") ) { seqFormat=CLUSTAL; } else if( linetype(line,"PILEUP") ) { seqFormat = MSF; } else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) { seqFormat = MSF; *dnaflag = FALSE; } else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) { seqFormat = MSF; *dnaflag = TRUE; } else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' && line[strlen(line)-2]=='.' ) { seqFormat = MSF; } else if( linetype(line,"!!RICH_SEQUENCE") ) { seqFormat = RSF; } else if(*line == '>') { /* no */ seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */ (*nseqs)++; } else if((*line == '"') || (*line == '%') || (*line == '#')) { seqFormat=GDE; /* GDE format */ if (*line == '%') { (*nseqs)++; *dnaflag = FALSE; } else if (*line == '#') { (*nseqs)++; *dnaflag = TRUE; } } else { seqFormat=UNKNOWN; return; } while(fgets(line,MAXLINE+1,fin) != NULL) { switch(seqFormat) { case EMBLSWISS: if( linetype(line,"ID") ) (*nseqs)++; break; case PIR: *nseqs = count_pir_seqs(); fseek(fin,0,0); return; case PEARSON: if( *line == '>' ) (*nseqs)++; break; case GDE: if(( *line == '%' ) && ( *dnaflag == FALSE)) (*nseqs)++; else if (( *line == '#') && ( *dnaflag == TRUE)) (*nseqs)++; break; case CLUSTAL: *nseqs = count_clustal_seqs(); /* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */ fseek(fin,0,0); return; case MSF: *nseqs = count_msf_seqs(); fseek(fin,0,0); return; case RSF: fseek(fin,0,0); *nseqs = count_rsf_seqs(); fseek(fin,0,0); return; case USER: default: break; } } fseek(fin,0,0); } static sint count_pir_seqs(void) /* count the number of sequences in a pir alignment file */ { char line[MAXLINE+1],c; sint nseqs, i; Boolean seq_ok; seq_ok = FALSE; while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */ if(*line == '>') break; for(i=0;seq_ok == FALSE;i++) { c=line[i]; if(c == '*') { seq_ok = TRUE; /* ok - end of sequence found */ break; } /* EOL */ if(c == '\n' || c == EOS) break; /* EOL */ } if (seq_ok == TRUE) break; } if (seq_ok == FALSE) { error("PIR format sequence end marker '*'\nmissing for one or more sequences."); return (sint)0; /* funny format*/ } nseqs = 1; while (fgets(line,MAXLINE+1,fin) != NULL) { if(*line == '>') { /* Look for start of next seq */ seq_ok = FALSE; while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */ if(*line == '>') { error("PIR format sequence end marker '*' missing for one or more sequences."); return (sint)0; /* funny format*/ } for(i=0;seq_ok == FALSE;i++) { c=line[i]; if(c == '*') { seq_ok = TRUE; /* ok - sequence found */ break; } /* EOL */ if(c == '\n' || c == EOS) break; /* EOL */ } if (seq_ok == TRUE) { nseqs++; break; } } } } return (sint)nseqs; } static sint count_clustal_seqs(void) /* count the number of sequences in a clustal alignment file */ { char line[MAXLINE+1]; sint nseqs; while (fgets(line,MAXLINE+1,fin) != NULL) { if(!cl_blankline(line)) break; /* Look for next non- */ } /* blank line */ nseqs = 1; while (fgets(line,MAXLINE+1,fin) != NULL) { if(cl_blankline(line)) return nseqs; nseqs++; } return (sint)0; /* if you got to here-funny format/no seqs.*/ } static sint count_msf_seqs(void) { /* count the number of sequences in a PILEUP alignment file */ char line[MAXLINE+1]; sint nseqs; while (fgets(line,MAXLINE+1,fin) != NULL) { if(linetype(line,"//")) break; } while (fgets(line,MAXLINE+1,fin) != NULL) { if(!blankline(line)) break; /* Look for next non- */ } /* blank line */ nseqs = 1; while (fgets(line,MAXLINE+1,fin) != NULL) { if(blankline(line)) return nseqs; nseqs++; } return (sint)0; /* if you got to here-funny format/no seqs.*/ } static sint count_rsf_seqs(void) { /* count the number of sequences in a GCG RSF alignment file */ char line[MAXLINE+1]; sint nseqs; nseqs = 0; /* skip the comments */ while (fgets(line,MAXLINE+1,fin) != NULL) { if(line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.') break; } while (fgets(line,MAXLINE+1,fin) != NULL) { if( *line == '{' ) nseqs++; } return (sint)nseqs; } static sint res_index(char *t,char c) { register sint i; for(i=0;t[i] && t[i] != c;i++) ; if(t[i]) return(i); else return -1; } sint seq_input(char *filename,sint explicit_type,Boolean append,ALNPTR mult_aln) { sint i; sint local_nseqs; sint max_names; mult_aln->dnaflag=FALSE; strcpy(mult_aln->filename,""); mult_aln->prf1.nseqs=0; strcpy(mult_aln->prf1.filename,""); mult_aln->prf1.ss.struct_penalties=FALSE; mult_aln->prf2.nseqs=0; strcpy(mult_aln->prf2.filename,""); mult_aln->prf2.ss.struct_penalties=FALSE; if (append) local_nseqs = readseqs(filename,(sint)0,mult_aln->nseqs,explicit_type,mult_aln); else { mult_aln->nseqs=0; local_nseqs = readseqs(filename,(sint)0,(sint)0,explicit_type,mult_aln); } if(local_nseqs < 0) /* file could not be opened */ { return local_nseqs; } else if(local_nseqs == 0) /* no sequences */ { error("No sequences in file! Bad format?"); return local_nseqs; } else { strcpy(mult_aln->filename,filename); if(append) mult_aln->nseqs+=local_nseqs; else mult_aln->nseqs=local_nseqs; info("Sequences assumed to be %s", mult_aln->dnaflag?"DNA":"PROTEIN"); info("\n\n"); max_names=0; for(i=0; inseqs; i++) { if(strlen(mult_aln->seqs[i].name)>max_names) max_names=strlen(mult_aln->seqs[i].name); } if(max_names<10) max_names=10; for(i=0; inseqs; i++) { info("Sequence %d: %-*s %6.d %s", (pint)i+1,max_names,mult_aln->seqs[i].name,(pint)mult_aln->seqs[i].len,mult_aln->dnaflag?"bp":"aa"); } } return local_nseqs; } sint profile_input(char *filename,sint prf_no,sint explicit_type,ALNPTR mult_aln) /* read a profile */ { /* prf_no is 1 or 2 */ sint local_nseqs, i; sint max_names; if(prf_no == 2 && mult_aln->prf1.nseqs<=0) { error("You must read in profile number 1 first"); return 0; } mult_aln->dnaflag=FALSE; mult_aln->nseqs=0; strcpy(mult_aln->filename,""); if(prf_no == 1) /* for the 1st profile */ { mult_aln->prf1.nseqs=0; strcpy(mult_aln->prf1.filename,""); mult_aln->prf1.ss.struct_penalties=FALSE; mult_aln->prf2.nseqs=0; strcpy(mult_aln->prf2.filename,""); mult_aln->prf2.ss.struct_penalties=FALSE; local_nseqs = readseqs(filename,(sint)1,(sint)0,explicit_type,mult_aln); /* (1) means 1st seq to be read = no. 1 */ if(local_nseqs == 0) /* no sequences */ { error("No sequences in file! Bad format?"); } else if (local_nseqs > 0) { /* success; found some seqs. */ strcpy(mult_aln->filename,filename); strcpy(mult_aln->prf1.filename,filename); mult_aln->nseqs = mult_aln->prf1.nseqs = local_nseqs; info("No. of seqs=%d",(pint)mult_aln->nseqs); } } else { /* first seq to be read = profile1_nseqs + 1 */ mult_aln->prf2.nseqs=0; strcpy(mult_aln->prf2.filename,""); mult_aln->prf2.ss.struct_penalties=FALSE; local_nseqs = readseqs(filename,(sint)2,mult_aln->prf1.nseqs,explicit_type,mult_aln); if(local_nseqs == 0) /* no sequences */ { error("No sequences in file! Bad format?"); } else if(local_nseqs > 0) { info("No. of seqs in profile=%d",(pint)local_nseqs); strcpy(mult_aln->filename,filename); strcpy(mult_aln->prf2.filename,filename); mult_aln->nseqs = mult_aln->prf1.nseqs + local_nseqs; mult_aln->prf2.nseqs = local_nseqs; info("Total no. of seqs =%d",(pint)mult_aln->nseqs); } } if(local_nseqs<=0) return local_nseqs; info("Sequences assumed to be %s", mult_aln->dnaflag?"DNA":"PROTEIN"); info("\n\n"); max_names=0; for(i=(mult_aln->prf2.nseqs==0)?0:mult_aln->prf1.nseqs; inseqs; i++) { if(strlen(mult_aln->seqs[i].name)>max_names) max_names=strlen(mult_aln->seqs[i].name); } if(max_names<10) max_names=10; for(i=(mult_aln->prf2.nseqs==0)?0:mult_aln->prf1.nseqs; inseqs; i++) { info("Sequence %d: %-*s %6.d %s", (pint)i+1,max_names,mult_aln->seqs[i].name,(pint)mult_aln->seqs[i].len,mult_aln->dnaflag?"bp":"aa"); } return local_nseqs; }