[Seqinr-commits] r2140 - in pkg: . src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Feb 16 10:59:13 CET 2022
Author: simonpenel
Date: 2022-02-16 10:59:13 +0100 (Wed, 16 Feb 2022)
New Revision: 2140
Modified:
pkg/DESCRIPTION
pkg/src/alignment.c
Log:
Fixing bug spotted by Danny Arends: gap option in dist.alignment is now working for protein sequences
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2022-01-03 13:15:10 UTC (rev 2139)
+++ pkg/DESCRIPTION 2022-02-16 09:59:13 UTC (rev 2140)
@@ -1,7 +1,7 @@
Encoding: UTF-8
Package: seqinr
-Version: 4.2-11
-Date: 2022-01-03
+Version: 4.2-12
+Date: 2022-02-16
Title: Biological Sequences Retrieval and Analysis
Authors at R: c(person("Delphine", "Charif", role = "aut"),
person("Olivier", "Clerc", role = "ctb"),
Modified: pkg/src/alignment.c
===================================================================
--- pkg/src/alignment.c 2022-01-03 13:15:10 UTC (rev 2139)
+++ pkg/src/alignment.c 2022-02-16 09:59:13 UTC (rev 2140)
@@ -13,13 +13,13 @@
string[ii] == ' ' || string[ii] == '\t') string[ii] = 0;
else break;
}
-
+
}
/**************************** end rem_blank ************************/
void free_mase(struct SEQMASE * aln, int nbsq)
-
+
{
int ii;
@@ -28,10 +28,10 @@
free(aln[ii].seq);
free(aln[ii].com);
}
-
+
free((char *) aln);
-
-
+
+
}
/******************************** end free_mase ************************/
@@ -43,7 +43,7 @@
char **seq, **seqname, **comments;
if(count1 == -1) max_count = 0;
-
+
if(count1 + 1 < max_count) return count1 + 1;
count1++;
@@ -66,7 +66,7 @@
comments = (char **)realloc(comments, max_count * sizeof(char *));
if(comments == NULL) return -1;
}
-
+
*pseq = seq; *pseqname = seqname; *pcomments = comments;
return count1;
}
@@ -74,7 +74,7 @@
/******************************** end one_more_seq_found ************************/
/***********************************************************************************************************************/
-/* lit un fichier MASE, renvoie une liste (objet R) contenant les séquences, les commentaies et les noms des espèces. */
+/* lit un fichier MASE, renvoie une liste (objet R) contenant les s�quences, les commentaies et les noms des esp�ces. */
/***********************************************************************************************************************/
@@ -88,43 +88,43 @@
char string[MAXSTRING + 1];
char c1, c2;
int i,ii, jj, kk = 0, numline, maxcom = 0;
-
+
SEXP listseq;
SEXP essai;
SEXP listcom;
SEXP listmn;
SEXP nombreseq;
-
- /*Passages des objets R (paramètres) dans des variables C */
- fic_name = (char *) CHAR(STRING_ELT(nomfic, 0));
+ /*Passages des objets R (param�tres) dans des variables C */
+ fic_name = (char *) CHAR(STRING_ELT(nomfic, 0));
+
if((fic = fopen(fic_name, "r")) == NULL) {
error("Can't open file");
}
-
+
c1 = 0;
nb_seq = 0;
lg = lgc = 0;
while(fgets(string, MAXSTRING, fic) != NULL) {
string[MAXSTRING] = 0;
-
+
lgs = strlen(string);
-
+
if(lgs >= (MAXSTRING - 1)) {
REprintf("\n Fatal Error. Too long line in alignment (> %d).\n", MAXSTRING);
- REprintf("Increase MAXSTRING and recompile.\n");
+ REprintf("Increase MAXSTRING and recompile.\n");
}
-
+
c2 = string[0];
-
+
if(string[0] == ';' && string[1] != ';') {
lgc += (lgs + 1);
}
-
-
+
+
if(c1 == ';' && c2 != c1) {
nb_seq++;
if(lg > lg_max) lg_max = lg;
@@ -131,16 +131,16 @@
if(lgc > maxcom) maxcom = lgc;
lg = lgc = 0;
}
-
+
else if(c2 != ';') lg += lgs;
c1 = c2;
-
+
}
if(lg > lg_max) lg_max = lg;
-
+
/******************************************/
- /* Création de 6 objets R qui seront */
+ /* Cr�ation de 6 objets R qui seront */
/******************************************/
PROTECT(listseq=allocVector(VECSXP,nb_seq));
@@ -148,19 +148,19 @@
PROTECT(listcom=allocVector(VECSXP,nb_seq));
PROTECT(listmn=allocVector(VECSXP,nb_seq));
PROTECT(nombreseq=NEW_INTEGER(1));
-
-
-
+
+
+
aln = (struct SEQMASE *) calloc(nb_seq + 1, sizeof(struct SEQMASE));
-
+
for(ii = 0; ii <= nb_seq; ii++) {
aln[ii].seq = (char *) calloc(lg_max + 1, sizeof(char));
aln[ii].com = (char *) calloc(maxcom + 1, sizeof(char));
aln[ii].com[0] = 0;
}
-
+
rewind(fic);
numline = 0;
@@ -173,29 +173,29 @@
error("Not a MASE file"); /* check format, thanks to J.H. Troesemeier */
goto fini;
}
-
+
c2 = string[0];
-
+
if(string[0] == ';' && string[1] != ';') {
strcat(aln[ii + 1].com, string);
}
-
+
if(c1 == ';' && c2 != c1) {
ii++;
kk = aln[ii].lg = 0;
-
+
rem_blank(string);
if((int) strlen(string) >= (MAXMNMASE - 1)) {
REprintf("Error. Maximum sequance name is %d characters\n", MAXMNMASE);
error("sequence name too long!");
- }
-
+ }
+
strcpy(aln[ii].mn, string);
-
+
lg = 0;
}
-
+
else if(c2 != ';') {
for(jj = 0; jj < MAXSTRING; jj++) {
if(string[jj] == 0) break;
@@ -207,20 +207,20 @@
}
}
c1 = c2;
-
+
}
-
+
fclose(fic);
-
+
lg_max = aln[0].lg;
for(ii = 1; ii < nb_seq; ii++)
if(aln[ii].lg > lg_max) lg_max = aln[ii].lg;
-
+
INTEGER(nombreseq)[0]=(int)nb_seq;
-
+
for(i=0;i<nb_seq;i++){
SET_ELEMENT(listseq,i,mkChar(aln[i].seq));
}
@@ -232,16 +232,16 @@
for(i=0;i<nb_seq;i++){
SET_ELEMENT(listmn,i,mkChar(aln[i].mn));
}
-
+
SET_ELEMENT(essai,0,nombreseq);
SET_ELEMENT(essai,1,listmn);
SET_ELEMENT(essai,2,listseq);
SET_ELEMENT(essai,3,listcom);
-
- fini:
+
+ fini:
free_mase(aln,nb_seq);
UNPROTECT(5);
-
+
return(essai);
}
@@ -249,9 +249,9 @@
/********************** end read_mase ****************************/
-
+
/*************************************************************************/
/* Compute distance between two aligned sequences using different matrix */
/*************************************************************************/
@@ -292,10 +292,10 @@
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1/3, 1/3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1/3, 1/3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1/3} };
-
-
-
+
+
+
MAXNSEQS = INTEGER_VALUE(nbseq);
totseqs = INTEGER_VALUE(nbseq);
mat_number= INTEGER_VALUE(matNumber);
@@ -305,142 +305,112 @@
PROTECT(d=NEW_NUMERIC(totseqs*totseqs));
seq = (char **)malloc(totseqs*sizeof(char *));
-
+
for(i=0;i<totseqs;i++){
seq[i] = (char *) CHAR(STRING_ELT(sequences,i));
}
ndiff = (int **)malloc(MAXNSEQS*sizeof(int *));
-
- for(j=0; j < MAXNSEQS; j++){
+
+ for(j=0; j < MAXNSEQS; j++){
ndiff[j] = (int *)malloc(MAXNSEQS*sizeof(int));
}
-
+
dist = (double **)malloc(MAXNSEQS*sizeof(double *));
-
- for(j=0; j < MAXNSEQS; j++){
+
+ for(j=0; j < MAXNSEQS; j++){
dist[j] = (double *)malloc(MAXNSEQS*sizeof(double));
}
-
-
-
+
+
+
/*********************************************************/
/* Computing distance between sequences i and j */
/*********************************************************/
-
- seq_long = (int)strlen(seq[0]);
-
- for (i = 0; i < totseqs; i++)
- {
- dist[i][i] = 0.0;
- }
- for (i = 0; i < totseqs; i++)
- {
- for (j = 0; j < i; j++)
- {
- ndiff[j][i] = ndiff[i][j] = 0;
- nbases = 0;
- for (k = 0; k < seq_long; k++)
- {
-
- if(seq_type == 1){
-
- /***************************/
- /* DNA/RNA sequences */
- /***************************/
- if ((strchr(DNA, seq[i][k]) == NULL) || (strchr(DNA, seq[j][k]) == NULL))
- seq[i][k] = seq[j][k] = 'X';
- if (seq[i][k] != '-' && seq[i][k] != 'N' && seq[i][k] != 'X' &&
- seq[j][k] != '-' && seq[j][k] != 'N' && seq[j][k] != 'X')
- {
- nbases++;
- if (seq[i][k] != seq[j][k])
- {
- ndiff[j][i]++;
- ndiff[i][j]++;
+ seq_long = (int)strlen(seq[0]);
+
+ for (i = 0; i < totseqs; i++) {
+ dist[i][i] = 0.0;
+ }
+ for (i = 0; i < totseqs; i++) {
+ for (j = 0; j < i; j++) {
+ ndiff[j][i] = ndiff[i][j] = 0;
+ nbases = 0;
+ for (k = 0; k < seq_long; k++) {
+ if (seq_type == 1) {
+ /***************************/
+ /* DNA/RNA sequences */
+ /***************************/
+ if ((strchr(DNA, seq[i][k]) == NULL) || (strchr(DNA, seq[j][k]) == NULL))
+ seq[i][k] = seq[j][k] = 'X';
+ if (seq[i][k] != '-' && seq[i][k] != 'N' && seq[i][k] != 'X' &&
+ seq[j][k] != '-' && seq[j][k] != 'N' && seq[j][k] != 'X') {
+ nbases++;
+ if (seq[i][k] != seq[j][k]) {
+ ndiff[j][i]++;
+ ndiff[i][j]++;
+ }
}
- }
-
- if (gap_option == 1)
- {
- if (seq[i][k] == '-' && seq[j][k] != '-' && seq[j][k] != 'N' && seq[j][k] != 'X')
- {
- nbases++;
- ndiff[j][i]++;
- ndiff[i][j]++;
- }
- if (seq[j][k] == '-' && seq[i][k] != '-' && seq[i][k] != 'N' && seq[i][k] != 'X')
- {
- nbases++;
- ndiff[j][i]++;
- ndiff[i][j]++;
- }
-
- }
}
-
else if(mat_number == 1 && seq_type == 0){
-
- /********************************************/
- /*Protein sequences with similarity matrix */
- /********************************************/
-
- if ((strchr(Prot, seq[i][k]) == NULL) || (strchr(Prot, seq[j][k]) == NULL))
- seq[i][k] = seq[j][k] = 'X';
- if (seq[i][k] != '-' && seq[i][k] != '*' && seq[i][k] != 'X' &&
- seq[j][k] != '-' && seq[j][k] != '*' && seq[j][k] != 'X')
- {
- nbases++;
- ndiff[i][j] = ndiff[i][j] + matp[mat_pos[seq[i][k] - 65]][mat_pos[seq[j][k] - 65]];
- ndiff[j][i] = ndiff[i][j];
-
- }
+ /********************************************/
+ /*Protein sequences with similarity matrix */
+ /********************************************/
+ if ((strchr(Prot, seq[i][k]) == NULL) || (strchr(Prot, seq[j][k]) == NULL))
+ seq[i][k] = seq[j][k] = 'X';
+ if (seq[i][k] != '-' && seq[i][k] != '*' && seq[i][k] != 'X' &&
+ seq[j][k] != '-' && seq[j][k] != '*' && seq[j][k] != 'X') {
+ nbases++;
+ ndiff[i][j] = ndiff[i][j] + matp[mat_pos[seq[i][k] - 65]][mat_pos[seq[j][k] - 65]];
+ ndiff[j][i] = ndiff[i][j];
+ }
}
-
else if( mat_number == 2 && seq_type == 0){
-
- /********************************************/
- /* Protein sequences with identity matrix */
- /********************************************/
-
- if ((strchr(Prot, seq[i][k]) == NULL) || (strchr(Prot, seq[j][k]) == NULL))
- seq[i][k] = seq[j][k] = 'X';
- if (seq[i][k] != '-' && seq[i][k] != '*' && seq[i][k] != 'X' &&
- seq[j][k] != '-' && seq[j][k] != '*' && seq[j][k] != 'X')
- {
- nbases++;
- if (seq[i][k] != seq[j][k])
- {
- ndiff[j][i]++;
- ndiff[i][j]++;
- }
- }
-
+ /********************************************/
+ /* Protein sequences with identity matrix */
+ /********************************************/
+ if ((strchr(Prot, seq[i][k]) == NULL) || (strchr(Prot, seq[j][k]) == NULL))
+ seq[i][k] = seq[j][k] = 'X';
+ if (seq[i][k] != '-' && seq[i][k] != '*' && seq[i][k] != 'X' &&
+ seq[j][k] != '-' && seq[j][k] != '*' && seq[j][k] != 'X') {
+ nbases++;
+ if (seq[i][k] != seq[j][k]) {
+ ndiff[j][i]++;
+ ndiff[i][j]++;
+ }
+ }
+ if (gap_option == 1) {
+ if (seq[i][k] == '-' && seq[j][k] != '-' && seq[j][k] != '*' && seq[j][k] != 'X') {
+ nbases++;
+ ndiff[j][i]++;
+ ndiff[i][j]++;
+ }
+ if (seq[j][k] == '-' && seq[i][k] != '-' && seq[i][k] != '*' && seq[i][k] != 'X') {
+ nbases++;
+ ndiff[j][i]++;
+ ndiff[i][j]++;
+ }
+ }
}
-
- dist[i][j] = dist[j][i] = sqrt((double)ndiff[i][j]/nbases);
+ dist[i][j] = dist[j][i] = sqrt((double)ndiff[i][j]/nbases);
}
- }
- }
-
-
- /********************************************************************************/
- /* Remplissage de l'objet R (matrice de taille nb_seq * nb_seq avec dist */
- /********************************************************************************/
+ }
+ }
- n=0;
-
- for(i=0;i<totseqs;i++){
- for(j=0;j<totseqs;j++){
- REAL(d)[n+j]=dist[i][j];
- }
- n=n+totseqs;
+
+ /********************************************************************************/
+ /* Remplissage de l'objet R (matrice de taille nb_seq * nb_seq avec dist */
+ /********************************************************************************/
+ n=0;
+ for(i=0;i<totseqs;i++){
+ for(j=0;j<totseqs;j++){
+ REAL(d)[n+j]=dist[i][j];
}
-
- UNPROTECT(1);
-
- return(d);
+ n=n+totseqs;
+ }
+ UNPROTECT(1);
+ return(d);
}
/****************************************/
@@ -455,13 +425,13 @@
SEXP listname;
SEXP nombreseq;
char *fname;
- FILE *in;
+ FILE *in;
char line[100], *p, *q;
int i,l, curr_spec, maxwidname=0, curr_len, tot_spec, wid_1_line, wid_block;
char **seq, **seqname, **comments;
-
- fname = (char *) CHAR(STRING_ELT(ficname,0));
+ fname = (char *) CHAR(STRING_ELT(ficname,0));
+
PROTECT(nombreseq=NEW_INTEGER(1));
PROTECT(list=allocVector(VECSXP,3));
@@ -469,7 +439,7 @@
if(in==NULL) {
error("File not found");
}
-
+
/* compter le nbre de seqs dans le fichier */
tot_spec = 0;
while(fgets(line, sizeof(line), in) != NULL) {
@@ -477,12 +447,12 @@
if(strncmp(line, " Name: ", 7) == 0) tot_spec++;
}
rewind(in);
-
+
INTEGER(nombreseq)[0]=tot_spec;
-
+
PROTECT(listname=allocVector(VECSXP,tot_spec));
PROTECT(listseq=allocVector(VECSXP,tot_spec));
-
+
seq = (char **)malloc(tot_spec * sizeof(char *));
if(seq == NULL) goto nomem;
comments = (char **)malloc(tot_spec * sizeof(char *));
@@ -489,7 +459,7 @@
if(comments == NULL) goto nomem;
seqname = (char **)malloc(tot_spec * sizeof(char *));
if(seqname == NULL) goto nomem;
-
+
p = NULL;
while( fgets(line,sizeof(line),in) != NULL) {
if( (p = strstr(line, "MSF: ")) != NULL) break;
@@ -501,11 +471,11 @@
tot_spec = -1;
do {
if(!fgets(line,sizeof(line),in))
- break;
+ break;
if( (p = strstr(line, "Name:") ) == NULL) continue;
tot_spec++;
- q = strstr(p, " Len: ");
+ q = strstr(p, " Len: ");
sscanf(q + 5, "%d", &l);
seq[tot_spec] = (char *)malloc(l + 1);
if(seq[tot_spec]==NULL) goto nomem;
@@ -541,17 +511,17 @@
}
else curr_spec++;
}
-
+
for(i=0; i<tot_spec+1; i++) {
SET_ELEMENT(listname,i,mkChar(seqname[i]));
SET_ELEMENT(listseq,i,mkChar(seq[i]));
}
-
+
SET_ELEMENT(list,0,nombreseq);
SET_ELEMENT(list,1,listname);
SET_ELEMENT(list,2,listseq);
-
-
+
+
fini:
fclose(in);
UNPROTECT(4);
@@ -580,19 +550,19 @@
char *p, *q, line[PHYNAME + 200];
char **seq, **comments, **seqname;
int totseqs, lenseqs, i, l;
-
+
q=0;
- fname = (char *) CHAR(STRING_ELT(ficname,0));
-
+ fname = (char *) CHAR(STRING_ELT(ficname,0));
+
PROTECT(nombreseq=NEW_INTEGER(1));
PROTECT(list=allocVector(VECSXP,3));
-
-
+
+
in=fopen(fname,"r");
if(in==NULL) {
error("file not found");
}
-
+
if(!fgets(line,sizeof(line),in) ||
sscanf(line, "%d%d", &totseqs, &lenseqs) != 2) {
error("Not a PHYLIP file");
@@ -604,7 +574,7 @@
PROTECT(listname=allocVector(VECSXP,totseqs));
PROTECT(listseq=allocVector(VECSXP,totseqs));
-
+
seq = (char **)malloc(totseqs * sizeof(char *));
if(seq == NULL) goto nomem;
seqname = (char **)malloc(totseqs * sizeof(char *));
@@ -684,7 +654,7 @@
char line[200], *p, *i;
char **seq, **seqname, **comments;
- fname = (char *) CHAR(STRING_ELT(ficname, 0));
+ fname = (char *) CHAR(STRING_ELT(ficname, 0));
PROTECT(nombreseq = NEW_INTEGER(1));
PROTECT(list = allocVector(VECSXP, 3));
@@ -692,7 +662,7 @@
/* Check that the file is available */
if((in = fopen(fname, "r")) == NULL) error("File not found");
-
+
/* How many sequences are in the file ? */
totseqs = 0;
@@ -700,8 +670,8 @@
if(*line == '>') totseqs++;
rewind(in);
- /* R objects creation */
-
+ /* R objects creation */
+
INTEGER(nombreseq)[0] = totseqs;
PROTECT(listname = allocVector(VECSXP, totseqs));
PROTECT(listseq = allocVector(VECSXP, totseqs));
@@ -711,7 +681,7 @@
seq = (char **) R_alloc(totseqs, sizeof(char *));
comments = (char **) R_alloc(totseqs, sizeof(char *));
seqname = (char **) R_alloc(totseqs, sizeof(char *));
-
+
lenseqs = MAXLENSEQ;
totseqs = -1;
i = fgets(line, sizeof(line), in);
@@ -723,15 +693,15 @@
while( i != NULL ){
totseqs++;
comments[totseqs] = NULL;
- p = line + 1;
- while(*p != '\n')
+ p = line + 1;
+ while(*p != '\n')
p++;
l = p - line - 1;
seqname[totseqs] = (char *) R_alloc(l + 1, sizeof(char));
- memcpy(seqname[totseqs], line + 1, l);
- seqname[totseqs][l] = '\0';
+ memcpy(seqname[totseqs], line + 1, l);
+ seqname[totseqs][l] = '\0';
SET_ELEMENT(listname, totseqs, mkChar(seqname[totseqs]));
seq[totseqs] = (char *) R_alloc(lenseqs + 1, sizeof(char));
@@ -759,7 +729,7 @@
SET_ELEMENT(list, 0, nombreseq);
SET_ELEMENT(list, 1, listname);
SET_ELEMENT(list, 2, listseq);
-
+
fclose(in);
UNPROTECT(4);
return list;
@@ -785,13 +755,13 @@
char line[200], *p;
int i, l = 0, curr_spec, first=TRUEL, curr_len, next_len, tot_spec, curr_max_len =0, carac, wid_name = 0;
char **seq, **comments, **seqname = NULL;
-
- fname = (char *) CHAR(STRING_ELT(ficname,0));
-
+
+ fname = (char *) CHAR(STRING_ELT(ficname,0));
+
PROTECT(nombreseq=NEW_INTEGER(1));
PROTECT(list=allocVector(VECSXP,3));
-
+
in=fopen(fname,"r");
if(in==NULL) {
@@ -799,12 +769,12 @@
return 0;
}
-
+
if(!fgets(line,sizeof(line),in) || strncmp(line,"CLUSTAL",7) != 0) { /* skip 1st line with CLUSTAL in it */
error("File not in CLUSTAL format!");
tot_spec = -1; goto fini;
}
-
+
/* skip next empty lines */
do {
carac = getc(in);
@@ -827,7 +797,7 @@
first = FALSE;
continue;
}
-
+
else if(tot_spec >= 0 && curr_spec == -1 &&
strncmp(line, seqname[0], strlen(seqname[0]) ) != 0) {
break;
@@ -839,17 +809,17 @@
}
else curr_spec++;
}
-
-
+
+
if(first && curr_spec == 0) {
/* calcul long partie nom: enlever tout ce qui n'est pas espace en fin */
- p = line + strlen(line) - 2;
- while(*p == ' ' || isdigit(*p) ) p--;
+ p = line + strlen(line) - 2;
+ while(*p == ' ' || isdigit(*p) ) p--;
while (*p != ' ') p--;
wid_name = p - line + 1;
- }
-
-
+ }
+
+
if(first) {
seqname[curr_spec] = (char *)malloc(wid_name+1);
if(seqname[curr_spec]==NULL) {
@@ -859,7 +829,7 @@
p = seqname[curr_spec] + wid_name - 1;
while(*p == ' '){
p--;
- }
+ }
*(p+1)=0;
if(curr_spec > tot_spec) tot_spec = curr_spec;
seq[curr_spec] = (char *)malloc(CLU_BLOCK_LEN+1);
@@ -871,7 +841,7 @@
}
if(curr_spec == 0) {
l = strlen(line) - 1;
- p = line + l - 1;
+ p = line + l - 1;
while(*p == ' ' || isdigit(*p) ) { p--; l--; }
l -= wid_name;
if(curr_len + l > curr_max_len) {
@@ -907,8 +877,8 @@
SET_ELEMENT(list,0,nombreseq);
SET_ELEMENT(list,1,listname);
SET_ELEMENT(list,2,listseq);
-
-
+
+
fini:
fclose(in);
UNPROTECT(4);
More information about the Seqinr-commits
mailing list