2023-07-04

Reading a .txt file in C: how to handle empty fields when parsing

I am trying to parse data from a .txt file whose format is not easy to handle. The separator is the space character. The file contains one field of variable length (it may itself contain spaces); this is the fifth column from the left. Therefore I parse the first four columns from the left, then parse the remaining columns from the right until I reach the variable-length field. This works. My main problem is that some fields are empty, see third row column 3. Because sscanf skips any run of whitespace, an empty field is not counted as a column, so with my code the data are not parsed accurately: in the output file the columns are wrong for those lines. Is there a way to make sscanf recognize empty fields instead of skipping them? I would be grateful for a tip on how to parse the data correctly. Code at onlinegdb: https://onlinegdb.com/8rOBlIfMhU

enter image description here

#include <stdio.h>
#include <string.h>

#define BUF 1500

// Reverse the NUL-terminated string `str` in place.
// Returns `str` itself; a NULL pointer or an empty string is returned untouched.
char *strrev(char *str)
{
    if (str == NULL || str[0] == '\0')
    {
        return str;
    }

    // Walk two indices towards each other, exchanging the characters they
    // point at until they meet in the middle.
    size_t lo = 0;
    size_t hi = strlen(str) - 1;
    while (lo < hi)
    {
        char tmp = str[lo];
        str[lo] = str[hi];
        str[hi] = tmp;
        ++lo;
        --hi;
    }
    return str;
}

// Record layout used by main(): LEFT_FIELDS whitespace-separated columns are
// taken from the start of each line, RIGHT_FIELDS from its end, and whatever
// remains in between is the single variable-length column (it may contain
// spaces).
enum { LEFT_FIELDS = 4, RIGHT_FIELDS = 57 };

// A column located inside the line buffer. It is a slice, not a copy, so it
// is NOT NUL-terminated and must be printed with "%.*s".
struct field
{
    const char *text;
    int len;
};

// Same character set that sscanf's "%s" treats as a separator.
static int is_blank(char ch)
{
    return ch == ' ' || ch == '\t' || ch == '\n' ||
           ch == '\r' || ch == '\v' || ch == '\f';
}

// Take `count` tokens from the front of [begin, end) and store them in `out`
// in reading order. Returns the position just past the last token taken, or
// NULL when the range holds fewer than `count` tokens.
static const char *take_left(const char *begin, const char *end,
                             struct field *out, int count)
{
    const char *cursor = begin;
    for (int k = 0; k < count; k++)
    {
        while (cursor < end && is_blank(*cursor))
            cursor++;
        if (cursor == end)
            return NULL;
        const char *start = cursor;
        while (cursor < end && !is_blank(*cursor))
            cursor++;
        out[k].text = start;
        out[k].len = (int)(cursor - start);
    }
    return cursor;
}

// Take `count` tokens from the back of [begin, end) and store them in `out`
// in reading order (out[count-1] is the last token of the line). Returns the
// start of the left-most token taken, or NULL when the range holds fewer than
// `count` tokens. Scanning backwards replaces the reverse/sscanf/reverse-again
// round trip and can never run into the columns left of `begin`.
static const char *take_right(const char *begin, const char *end,
                              struct field *out, int count)
{
    const char *cursor = end;
    for (int k = count - 1; k >= 0; k--)
    {
        while (cursor > begin && is_blank(cursor[-1]))
            cursor--;
        if (cursor == begin)
            return NULL;
        const char *stop = cursor;
        while (cursor > begin && !is_blank(cursor[-1]))
            cursor--;
        out[k].text = cursor;
        out[k].len = (int)(stop - cursor);
    }
    return cursor;
}

// Writes seven sample records to /tmp/abc123, reads them back line by line,
// splits every line into 4 leading columns, one free-text column and 57
// trailing columns, echoes the first five columns on stdout and writes all
// columns, separated by "; ", to output.txt.
// Returns 0 on success and 1 if a file cannot be opened or written.
//
// Known limitation: columns are found by splitting on whitespace, so a column
// that is blank in the input is invisible and every later column of that row
// shifts by one. Several of the sample rows below have such a blank column.
// NOTE(review): the sample rows look column-aligned, so slicing by fixed
// column offsets is probably the real fix -- confirm the layout first.
int main()
{
    FILE *ptr = fopen("/tmp/abc123", "w");
    if (ptr == NULL)
    {
        printf("file could not be opened\n");
        return 1;
    }
    int write_failed = fputs(
   "10000   07/01/1986   68391610   68391610   OPTIMUM MANUFACTURING INC             OMFGA          7952    10                 10396      3       3         3990       3990     39       399    03/12/1986   OMFGA                     Q          A         R       -2.56250         1000         .             2.75000        2.37500        .             .             .       C             C           3680       2    30/01/1986      .          .            .         .              .                 .       .          .             .          .             .             .        1.00000     1.00             .            .            .        .        .      1        1        9       2       0.013809     0.013800     0.011061     0.011046     0.014954\n"
   "12781   30/11/1970   84857L10   50558810   LACLEDE GAS CO                        LG            21080    11                     0      1       1         4925       2741     27       274             .                             N          A         R       25.00000         3500         .            25.00000       24.00000        .             .             .      0.041667      0.041667     4141       .             .      .          .            .         .              .                 .       .          .             .          .             .             .        4.00000     4.0              .            .            .        .        .      .        .        .       .       0.016698     0.016439     0.021276     0.020949     0.014779\n"
   "13901   27/05/1955   02209S10              PHILIP MORRIS & CO LTD                              21398    11                     0      1       1         2110       2111     21       211             .                             N          A         R       42.00000         4400       40.87500       42.00000       40.87500        .             .             .      0.030675      0.030675      2887       .             .      .          .            .         .              .                 .       .          .             .          .             .             .        2479.29   576.000            .            .            .        .        .      .        .        .       .       0.001626     0.001543     0.001477     0.001381      .\n"
   "13901   31/05/1955   02209S10              PHILIP MORRIS & CO LTD                              21398    11                     0      1       1         2110       2111     21       211             .                             N          A         R       41.37500         5600       42.12500       42.12500       41.00000        .             .             .     -0.014881     -0.014881      2887       .             .      .          .            .         .              .                 .       .          .             .          .             .             .        2479.29   576.000            .            .            .        .        .      .        .        .       .       0.000496     0.000165    -0.000448    -0.000851      .\n"
   "13901   01/06/1955   02209S10              PHILIP MORRIS INC                                   21398    11                     0      1       1         2110       2111     21       211    01/07/1962                             N          A         R       40.00000        11300       40.87500       40.87500       40.00000        .             .             .     -0.033233     -0.033233      2887       2    29/12/1955      .          .            .         .              .                 .       .          .             .          .             .             .        2479.29   576.000            .            .            .        .        .      .        .        .       .       0.001683     0.001476    -0.000496    -0.000724      .\n"
   "13901   02/06/1955   02209S10              PHILIP MORRIS INC                                   21398    11                     0      1       1         2110       2111     21       211             .                             N          A         R       39.87500         9600       40.00000       40.12500       39.87500        .             .             .     -0.003125     -0.003125      2887       .             .      .          .            .         .              .                 .       .          .             .          .             .             .        2479.29   576.000            .            .            .        .        .      .        .        .       .       0.003036     0.002973     0.002027     0.001912      .\n"
   "13901   03/06/1955   02209S10              PHILIP MORRIS INC                                   21398    11                     0      1       1         2110       2111     21       211             .                             N          A         R       40.12500         5500       40.00000       40.62500       40.00000        .             .             .      0.006270      0.006270      2887       .             .      .          .            .         .              .                 .       .          .             .          .             .             .        2479.29   576.000            .            .            .        .        .      .        .        .       .       0.006440     0.006420     0.004233     0.004141      .\n"
  ,ptr) == EOF;
    // fclose flushes the buffered sample data, so its result matters too.
    if (fclose(ptr) != 0)
    {
        write_failed = 1;
    }
    if (write_failed)
    {
        printf("sample data could not be written\n");
        return 1;
    }

    FILE *fp = fopen("/tmp/abc123", "r");
    if (fp == NULL)
    {
        printf("file could not be opened\n");
        return 1;
    }

    FILE *fpp = fopen("output.txt", "w");
    if (fpp == NULL)
    {
        printf("file could not be opened\n");
        fclose(fp);
        return 1;
    }

    char puffer[BUF];
    unsigned long line_no = 0;

    while (fgets(puffer, BUF, fp) != NULL)
    {
        struct field left[LEFT_FIELDS];
        struct field right[RIGHT_FIELDS];
        const char *end = puffer + strlen(puffer);

        line_no++;

        // parse first four columns from left side
        const char *name_begin = take_left(puffer, end, left, LEFT_FIELDS);
        if (name_begin == NULL)
        {
            fprintf(stderr, "line %lu: fewer than %d leading columns, skipped\n",
                    line_no, (int)LEFT_FIELDS);
            continue;
        }

        // parse 57 columns from the right side, never crossing into the
        // columns already taken from the left
        const char *name_end = take_right(name_begin, end, right, RIGHT_FIELDS);
        if (name_end == NULL)
        {
            fprintf(stderr, "line %lu: fewer than %d trailing columns, skipped\n",
                    line_no, (int)RIGHT_FIELDS);
            continue;
        }

        // the variable field is simply what is left in the middle, without
        // the padding blanks on either side
        while (name_begin < name_end && is_blank(*name_begin))
            name_begin++;
        while (name_end > name_begin && is_blank(name_end[-1]))
            name_end--;
        int name_len = (int)(name_end - name_begin);

        // print first 5 columns in the console
        printf("%.*s %.*s %.*s %.*s %.*s\n",
               left[0].len, left[0].text, left[1].len, left[1].text,
               left[2].len, left[2].text, left[3].len, left[3].text,
               name_len, name_begin);

        // print all parsed columns in output.txt file
        for (int k = 0; k < LEFT_FIELDS; k++)
        {
            fprintf(fpp, "%.*s; ", left[k].len, left[k].text);
        }
        fprintf(fpp, "%.*s", name_len, name_begin);
        for (int k = 0; k < RIGHT_FIELDS; k++)
        {
            fprintf(fpp, "; %.*s", right[k].len, right[k].text);
        }
        fputc('\n', fpp);
    }

    fclose(fp);
    // Closing the output flushes it; a failure here means lost data.
    if (fclose(fpp) != 0)
    {
        printf("output file could not be written\n");
        return 1;
    }
    return 0;
}



No comments:

Post a Comment