/** * KMimeMagic is inspired by the code of the * Apache Web Server. * * Rewritten for KDE by Fritz Elfert * fritz@kde.org */ #include #include #include #include #include #include #include #include #include #include #include #include #include //#include "config-kfm.h" #include "kmimemagic.h" /* * data structures and related constants */ #define MIME_MAGIC_DEBUG 0 #if MIME_MAGIC_DEBUG #define debug(fmt, args...) debugT(fmt "\n" , ## args) #else #define debug(fmt, args...) #endif #define DECLINED 999 #define ERROR 998 #define OK 0 /* * Buitltin Mime types */ #define MIME_BINARY_UNKNOWN "application/octet-stream" #define MIME_BINARY_UNREADABLE "application/x-unreadable" #define MIME_BINARY_ZEROSIZE "application/x-zerosize" #define MIME_TEXT_UNKNOWN "text/plain" #define MIME_INODE_DIR "inode/directory" #define MIME_INODE_CDEV "inode/chardevice" #define MIME_INODE_BDEV "inode/blockdevice" #define MIME_INODE_FIFO "inode/fifo" #define MIME_INODE_LINK "inode/link" #define MIME_INODE_SOCK "inode/socket" // Following should go in magic-file - Fritz #define MIME_APPL_TROFF "application/x-troff" #define MIME_APPL_TAR "application/x-tar" #define MIME_TEXT_FORTRAN "text/x-fortran" #define MAXMIMESTRING 256 #define HOWMANY 1024 /* big enough to recognize most WWW files */ #define MAXDESC 50 /* max leng of text description */ #define MAXstring 64 /* max leng of "string" types */ typedef union VALUETYPE { unsigned char b; unsigned short h; unsigned long l; char s[MAXstring]; unsigned char hs[2]; /* 2 bytes of a fixed-endian "short" */ unsigned char hl[4]; /* 2 bytes of a fixed-endian "long" */ } VALUETYPE; struct magic { struct magic *next; /* link to next entry */ int lineno; /* line number from magic file */ short flag; #define INDIR 1 /* if '>(...)' appears, */ #define UNSIGNED 2 /* comparison is unsigned */ short cont_level; /* level of ">" */ struct { char type; /* byte short long */ long offset; /* offset from indirection */ } in; long offset; /* offset to magic number */ unsigned char reln; /* relation (0=eq, '>'=gt, etc) */ char type; /* int, short, long or string. */ char vallen; /* length of string value, if any */ #define BYTE 1 #define SHORT 2 #define LONG 4 #define STRING 5 #define DATE 6 #define BESHORT 7 #define BELONG 8 #define BEDATE 9 #define LESHORT 10 #define LELONG 11 #define LEDATE 12 VALUETYPE value; /* either number or string */ unsigned long mask; /* mask before comparison with value */ char nospflag; /* supress space character */ /* NOTE: this string is suspected of overrunning - find it! */ char desc[MAXDESC]; /* description */ }; /* * data structures for tar file recognition * -------------------------------------------------------------------------- * Header file for public domain tar (tape archive) program. * * @(#)tar.h 1.20 86/10/29 Public Domain. Created 25 August 1985 by John * Gilmore, ihnp4!hoptoad!gnu. * * Header block on tape. * * I'm going to use traditional DP naming conventions here. A "block" is a big * chunk of stuff that we do I/O on. A "record" is a piece of info that we * care about. Typically many "record"s fit into a "block". */ #define RECORDSIZE 512 #define NAMSIZ 100 #define TUNMLEN 32 #define TGNMLEN 32 union record { char charptr[RECORDSIZE]; struct header { char name[NAMSIZ]; char mode[8]; char uid[8]; char gid[8]; char size[12]; char mtime[12]; char chksum[8]; char linkflag; char linkname[NAMSIZ]; char magic[8]; char uname[TUNMLEN]; char gname[TGNMLEN]; char devmajor[8]; char devminor[8]; } header; }; /* The magic field is filled with this if uname and gname are valid. */ #define TMAGIC "ustar " /* 7 chars and a null */ /* * file-function prototypes */ static int is_tar(unsigned char *, int); static unsigned long signextend(struct magic *, unsigned long); static int getvalue(struct magic *, char **); static int hextoint(int); static char *getstr(char *, char *, int, int *); static int parse(char *, int); static int mget(union VALUETYPE *, unsigned char *, struct magic *, int); static int mcheck(union VALUETYPE *, struct magic *); static int mconvert(union VALUETYPE *, struct magic *); static long from_oct(int, char *); /* * includes for ASCII substring recognition formerly "names.h" in file * command * * Original notes: names and types used by ascmagic in file(1). * These tokens are * here because they can appear anywhere in the first HOWMANY bytes, while * tokens in /etc/magic must appear at fixed offsets into the file. Don't * make HOWMANY too high unless you have a very fast CPU. */ /* these types are used calculate index to 'types': keep em in sync! */ /* HTML inserted in first because this is a web server module now */ #define L_HTML 0x001 /* HTML */ #define L_C 0x002 /* first and foremost on UNIX */ #define L_MAKE 0x004 /* Makefiles */ #define L_PLI 0x008 /* PL/1 */ #define L_MACH 0x010 /* some kinda assembler */ #define L_ENG 0x020 /* English */ #define L_PAS 0x040 /* Pascal */ #define L_JAVA 0x080 /* Java source */ #define L_CPP 0x100 /* C++ */ #define L_MAIL 0x200 /* Electronic mail */ #define L_NEWS 0x400 /* Usenet Netnews */ #define P_HTML 0 /* HTML */ #define P_C 1 /* first and foremost on UNIX */ #define P_MAKE 2 /* Makefiles */ #define P_PLI 3 /* PL/1 */ #define P_MACH 4 /* some kinda assembler */ #define P_ENG 5 /* English */ #define P_PAS 6 /* Pascal */ #define P_JAVA 7 /* Java source */ #define P_CPP 8 /* C++ */ #define P_MAIL 9 /* Electronic mail */ #define P_NEWS 10 /* Usenet Netnews */ typedef struct asc_type { char *type; int kwords; double weight; } asc_type; static asc_type types[] = { { "text/html", 10, 1.2 }, { "text/x-c", 9, 1.3 }, { "text/x-makefile", 4, 1.9 }, { "text/x-pli", 1, 3 }, { "text/x-assembler", 6, 2.1 }, { "text/english", 2, 0.2 }, { "text/x-pascal", 1, 1 }, { "text/x-java", 14, 1 }, { "text/x-c++", 14, 1 }, { "message/rfc822", 4, 1.9 }, { "message/news", 3, 2 } }; #define NTYPES (sizeof(types)/sizeof(asc_type)) static struct names { char *name; short type; } names[] = { /* These must be sorted by eye for optimal hit rate */ /* Add to this list only after substantial meditation */ { "", L_HTML }, { "", L_HTML }, { "", L_HTML }, { "", L_HTML }, { "", L_HTML }, { "<TITLE>", L_HTML }, { "<h1>", L_HTML }, { "<H1>", L_HTML }, { "<!--", L_HTML }, { "<!DOCTYPE HTML", L_HTML }, { "/*", L_C|L_CPP|L_JAVA }, /* must precede "The", "the", etc. */ { "//", L_CPP|L_JAVA }, /* must precede "The", "the", etc. */ { "#include", L_C|L_CPP }, { "char", L_C|L_CPP|L_JAVA }, { "The", L_ENG }, { "the", L_ENG }, { "double", L_C|L_CPP|L_JAVA }, { "extern", L_C|L_CPP }, { "float", L_C|L_CPP|L_JAVA }, { "real", L_C|L_CPP|L_JAVA }, { "struct", L_C|L_CPP }, { "union", L_C|L_CPP }, { "implements", L_JAVA }, { "package", L_JAVA }, { "super", L_JAVA }, { "import", L_JAVA }, { "this", L_CPP|L_JAVA }, { "class", L_CPP|L_JAVA }, { "public", L_CPP|L_JAVA }, { "private", L_CPP|L_JAVA }, { "CFLAGS", L_MAKE }, { "LDFLAGS", L_MAKE }, { "all:", L_MAKE }, { ".PRECIOUS", L_MAKE }, /* * Too many files of text have these words in them. Find another way * to recognize Fortrash. */ { ".ascii", L_MACH }, { ".asciiz", L_MACH }, { ".byte", L_MACH }, { ".even", L_MACH }, { ".globl", L_MACH }, { "clr", L_MACH }, { "(input", L_PAS }, { "dcl", L_PLI }, { "Received:", L_MAIL }, { ">From", L_MAIL }, { "Return-Path:", L_MAIL }, { "Cc:", L_MAIL }, { "Newsgroups:", L_NEWS }, { "Path:", L_NEWS }, { "Organization:", L_NEWS }, { NULL, 0 } }; /* current config */ typedef struct { char *magicfile; /* where magic be found */ struct magic *magic, /* head of magic config list */ *last; } config_rec; static int accuracy; config_rec *conf; void error( const char *msg, ... ) { va_list ap; va_start( ap, msg ); // use variable arg list QString tmp = msg; tmp += "\n"; vfprintf( stderr, tmp.data() , ap ); va_end( ap ); accuracy = 0; } #if (MIME_MAGIC_DEBUG > 1) static void test_table() { struct magic *m; struct magic *prevm = NULL; debug("%s: started", __FUNCTION__); for (m = conf->magic; m; m = m->next) { if (isprint((((unsigned long) m) >> 24) & 255) && isprint((((unsigned long) m) >> 16) & 255) && isprint((((unsigned long) m) >> 8) & 255) && isprint(((unsigned long) m) & 255)) { debug("%s: POINTER CLOBBERED! " "m=\"%c%c%c%c\" line=%d", __FUNCTION__, (((unsigned long) m) >> 24) & 255, (((unsigned long) m) >> 16) & 255, (((unsigned long) m) >> 8) & 255, ((unsigned long) m) & 255, prevm ? prevm->lineno : -1); break; } prevm = m; } } #endif #define EATAB {while (isascii((unsigned char) *l) && \ isspace((unsigned char) *l)) ++l;} static int parse_line(char *line, int *rule, int lineno) { int ws_offset; /* delete newline */ if (line[0]) { line[strlen(line) - 1] = '\0'; } /* skip leading whitespace */ ws_offset = 0; while (line[ws_offset] && isspace(line[ws_offset])) { ws_offset++; } /* skip blank lines */ if (line[ws_offset] == 0) { return 0; } /* comment, do not parse */ if (line[ws_offset] == '#') return 0; /* if we get here, we're going to use it so count it */ (*rule)++; /* parse it */ return (parse(line + ws_offset, lineno) != 0); } /* * apprentice - load configuration from the magic file. */ static int apprentice() { FILE *f; char line[BUFSIZ + 1]; int errs = 0; int lineno; int rule = 0; char *fname; if (!conf->magicfile) return -1; fname = conf->magicfile; f = fopen(fname, "r"); if (f == NULL) { error("kmimelib: can't read magic file %s: %s", fname, strerror(errno)); return -1; } /* parse it */ for (lineno = 1; fgets(line, BUFSIZ, f) != NULL; lineno++) if (parse_line(line, &rule, lineno)) errs++; fclose(f); #if (MIME_MAGIC_DEBUG > 1) debug("%s: conf=%p file=%s m=%s m->next=%s last=%s", __FUNCTION__, conf, conf->magicfile ? conf->magicfile : "NULL", conf->magic ? "set" : "NULL", (conf->magic && conf->magic->next) ? "set" : "NULL", conf->last ? "set" : "NULL"); debug("%s: read %d lines, %d rules, %d errors", __FUNCTION__, lineno, rule, errs); test_table(); #endif return (errs ? -1 : 0); } static int buff_apprentice(char *buff) { char line[BUFSIZ + 2]; int errs = 0; int lineno = 1; char *start = buff; char *end; int count = 0; int rule = 0; int len = strlen(buff) + 1; /* parse it */ do { count = (len > BUFSIZ-1)?BUFSIZ-1:len; strncpy(line, start, count); line[count] = '\0'; if ((end = strchr(line, '\n'))) { *(++end) = '\0'; count = strlen(line); } else strcat(line, "\n"); start += count; len -= count; if (parse_line(line, &rule, lineno)) errs++; lineno++; } while (len > 0); #if (MIME_MAGIC_DEBUG > 1) debug("%s: conf=%p m=%s m->next=%s last=%s", __FUNCTION__, conf, conf->magic ? "set" : "NULL", (conf->magic && conf->magic->next) ? "set" : "NULL", conf->last ? "set" : "NULL"); debug("%s: read %d lines, %d rules, %d errors", __FUNCTION__, lineno, rule, errs); test_table(); #endif return (errs ? -1 : 0); } /* * extend the sign bit if the comparison is to be signed */ static unsigned long signextend(struct magic *m, unsigned long v) { if (!(m->flag & UNSIGNED)) switch (m->type) { /* * Do not remove the casts below. They are vital. * When later compared with the data, the sign * extension must have happened. */ case BYTE: v = (char) v; break; case SHORT: case BESHORT: case LESHORT: v = (short) v; break; case DATE: case BEDATE: case LEDATE: case LONG: case BELONG: case LELONG: v = (long) v; break; case STRING: break; default: error("%s: can't happen: m->type=%d", __FUNCTION__, m->type); return ERROR; } return v; } /* * parse one line from magic file, put into magic[index++] if valid */ static int parse(char *l, int lineno) { int i = 0; struct magic *m; char *t, *s; /* allocate magic structure entry */ if ((m = (struct magic *) calloc(1, sizeof(struct magic))) == NULL) { error("%s: Out of memory.", __FUNCTION__); return -1; } /* append to linked list */ m->next = NULL; if (!conf->magic || !conf->last) { conf->magic = conf->last = m; } else { conf->last->next = m; conf->last = m; } /* set values in magic structure */ m->flag = 0; m->cont_level = 0; m->lineno = lineno; while (*l == '>') { ++l; /* step over */ m->cont_level++; } if (m->cont_level != 0 && *l == '(') { ++l; /* step over */ m->flag |= INDIR; } /* get offset, then skip over it */ m->offset = (int) strtol(l, &t, 0); if (l == t) { error("%s: offset %s invalid", __FUNCTION__, l); } l = t; if (m->flag & INDIR) { m->in.type = LONG; m->in.offset = 0; /* * read [.lbs][+-]nnnnn) */ if (*l == '.') { switch (*++l) { case 'l': m->in.type = LONG; break; case 's': m->in.type = SHORT; break; case 'b': m->in.type = BYTE; break; default: error("%s: indirect offset type %c invalid", __FUNCTION__, *l); break; } l++; } s = l; if (*l == '+' || *l == '-') l++; if (isdigit((unsigned char) *l)) { m->in.offset = strtol(l, &t, 0); if (*s == '-') m->in.offset = -m->in.offset; } else t = l; if (*t++ != ')') { error("%s: missing ')' in indirect offset", __FUNCTION__); } l = t; } while (isascii((unsigned char) *l) && isdigit((unsigned char) *l)) ++l; EATAB; #define NBYTE 4 #define NSHORT 5 #define NLONG 4 #define NSTRING 6 #define NDATE 4 #define NBESHORT 7 #define NBELONG 6 #define NBEDATE 6 #define NLESHORT 7 #define NLELONG 6 #define NLEDATE 6 if (*l == 'u') { ++l; m->flag |= UNSIGNED; } /* get type, skip it */ if (strncmp(l, "byte", NBYTE) == 0) { m->type = BYTE; l += NBYTE; } else if (strncmp(l, "short", NSHORT) == 0) { m->type = SHORT; l += NSHORT; } else if (strncmp(l, "long", NLONG) == 0) { m->type = LONG; l += NLONG; } else if (strncmp(l, "string", NSTRING) == 0) { m->type = STRING; l += NSTRING; } else if (strncmp(l, "date", NDATE) == 0) { m->type = DATE; l += NDATE; } else if (strncmp(l, "beshort", NBESHORT) == 0) { m->type = BESHORT; l += NBESHORT; } else if (strncmp(l, "belong", NBELONG) == 0) { m->type = BELONG; l += NBELONG; } else if (strncmp(l, "bedate", NBEDATE) == 0) { m->type = BEDATE; l += NBEDATE; } else if (strncmp(l, "leshort", NLESHORT) == 0) { m->type = LESHORT; l += NLESHORT; } else if (strncmp(l, "lelong", NLELONG) == 0) { m->type = LELONG; l += NLELONG; } else if (strncmp(l, "ledate", NLEDATE) == 0) { m->type = LEDATE; l += NLEDATE; } else { error("%s: type %s invalid", __FUNCTION__, l); return -1; } /* New-style anding: "0 byte&0x80 =0x80 dynamically linked" */ if (*l == '&') { ++l; m->mask = signextend(m, strtol(l, &l, 0)); } else m->mask = ~0L; EATAB; switch (*l) { case '>': case '<': /* Old-style anding: "0 byte &0x80 dynamically linked" */ case '&': case '^': case '=': m->reln = *l; ++l; break; case '!': if (m->type != STRING) { m->reln = *l; ++l; break; } /* FALL THROUGH */ default: if (*l == 'x' && isascii((unsigned char) l[1]) && isspace((unsigned char) l[1])) { m->reln = *l; ++l; goto GetDesc; /* Bill The Cat */ } m->reln = '='; break; } EATAB; if (getvalue(m, &l)) return -1; /* * now get last part - the description */ GetDesc: EATAB; if (l[0] == '\b') { ++l; m->nospflag = 1; } else if ((l[0] == '\\') && (l[1] == 'b')) { ++l; ++l; m->nospflag = 1; } else m->nospflag = 0; while ((m->desc[i++] = *l++) != '\0' && i < MAXDESC) /* NULLBODY */ ; #if (MIME_MAGIC_DEBUG > 1) debug("%s: line=%d m=%p next=%p cont=%d desc=%s", __FUNCTION__, lineno, m, m->next, m->cont_level, m->desc ? m->desc : "NULL"); #endif /* MIME_MAGIC_DEBUG */ return 0; } /* * Read a numeric value from a pointer, into the value union of a magic * pointer, according to the magic type. Update the string pointer to point * just after the number read. Return 0 for success, non-zero for failure. */ static int getvalue(struct magic *m, char **p) { int slen; if (m->type == STRING) { *p = getstr(*p, m->value.s, sizeof(m->value.s), &slen); m->vallen = slen; } else if (m->reln != 'x') m->value.l = signextend(m, strtol(*p, p, 0)); return 0; } /* * Convert a string containing C character escapes. Stop at an unescaped * space or tab. Copy the converted version to "p", returning its length in * *slen. Return updated scan pointer as function result. */ static char * getstr(register char *s, register char *p, int plen, int *slen) { char *origs = s, *origp = p; char *pmax = p + plen - 1; register int c; register int val; while ((c = *s++) != '\0') { if (isspace((unsigned char) c)) break; if (p >= pmax) { error("String too long: %s", origs); break; } if (c == '\\') { switch (c = *s++) { case '\0': goto out; default: *p++ = (char) c; break; case 'n': *p++ = '\n'; break; case 'r': *p++ = '\r'; break; case 'b': *p++ = '\b'; break; case 't': *p++ = '\t'; break; case 'f': *p++ = '\f'; break; case 'v': *p++ = '\v'; break; /* \ and up to 3 octal digits */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': val = c - '0'; c = *s++; /* try for 2 */ if (c >= '0' && c <= '7') { val = (val << 3) | (c - '0'); c = *s++; /* try for 3 */ if (c >= '0' && c <= '7') val = (val << 3) | (c - '0'); else --s; } else --s; *p++ = (char) val; break; /* \x and up to 3 hex digits */ case 'x': val = 'x'; /* Default if no digits */ c = hextoint(*s++); /* Get next char */ if (c >= 0) { val = c; c = hextoint(*s++); if (c >= 0) { val = (val << 4) + c; c = hextoint(*s++); if (c >= 0) { val = (val << 4) + c; } else --s; } else --s; } else --s; *p++ = (char) val; break; } } else *p++ = (char) c; } out: *p = '\0'; *slen = p - origp; return s; } /* Single hex char to int; -1 if not a hex char. */ static int hextoint(int c) { if (!isascii((unsigned char) c)) return -1; if (isdigit((unsigned char) c)) return c - '0'; if ((c >= 'a') && (c <= 'f')) return c + 10 - 'a'; if ((c >= 'A') && (c <= 'F')) return c + 10 - 'A'; return -1; } /* * Convert the byte order of the data we are looking at */ static int mconvert(union VALUETYPE *p, struct magic *m) { char *rt; switch (m->type) { case BYTE: return 1; case STRING: /* Null terminate and eat the return */ p->s[sizeof(p->s) - 1] = '\0'; if ((rt = strchr(p->s, '\n')) != NULL) *rt = '\0'; return 1; #ifndef WORDS_BIGENDIAN case SHORT: #endif case BESHORT: p->h = (short) ((p->hs[0] << 8) | (p->hs[1])); return 1; #ifndef WORDS_BIGENDIAN case LONG: case DATE: #endif case BELONG: case BEDATE: p->l = (long) ((p->hl[0] << 24) | (p->hl[1] << 16) | (p->hl[2] << 8) | (p->hl[3])); return 1; #ifdef WORDS_BIGENDIAN case SHORT: #endif case LESHORT: p->h = (short) ((p->hs[1] << 8) | (p->hs[0])); return 1; #ifdef WORDS_BIGENDIAN case LONG: case DATE: #endif case LELONG: case LEDATE: p->l = (long) ((p->hl[3] << 24) | (p->hl[2] << 16) | (p->hl[1] << 8) | (p->hl[0])); return 1; default: error("%s: invalid type %d", __FUNCTION__, m->type); return 0; } } static int mget(union VALUETYPE *p, unsigned char *s, struct magic *m, int nbytes) { long offset = m->offset; if (offset + (int)sizeof(union VALUETYPE) > nbytes) return 0; memcpy(p, s + offset, sizeof(union VALUETYPE)); if (!mconvert(p, m)) return 0; if (m->flag & INDIR) { switch (m->in.type) { case BYTE: offset = p->b + m->in.offset; break; case SHORT: offset = p->h + m->in.offset; break; case LONG: offset = p->l + m->in.offset; break; } if (offset + (int)sizeof(union VALUETYPE) > nbytes) return 0; memcpy(p, s + offset, sizeof(union VALUETYPE)); if (!mconvert(p, m)) return 0; } return 1; } static int mcheck(union VALUETYPE *p, struct magic *m) { register unsigned long l = m->value.l; register unsigned long v; int matched; if ((m->value.s[0] == 'x') && (m->value.s[1] == '\0')) { error("BOINK"); return 1; } switch (m->type) { case BYTE: v = p->b; break; case SHORT: case BESHORT: case LESHORT: v = p->h; break; case LONG: case BELONG: case LELONG: case DATE: case BEDATE: case LEDATE: v = p->l; break; case STRING: l = 0; /* * What we want here is: v = strncmp(m->value.s, p->s, * m->vallen); but ignoring any nulls. bcmp doesn't give * -/+/0 and isn't universally available anyway. */ v = 0; { register unsigned char *a = (unsigned char *) m->value.s; register unsigned char *b = (unsigned char *) p->s; register int len = m->vallen; while (--len >= 0) if ((v = *b++ - *a++) != 0) break; } break; default: error("%s: invalid type %d", __FUNCTION__, m->type); return 0; /* NOTREACHED */ } #if 0 debug("Before signextend %08x", v); #endif v = signextend(m, v) & m->mask; #if 0 debug("After signextend %08x", v); #endif switch (m->reln) { case 'x': matched = 1; break; case '!': matched = v != l; break; case '=': matched = v == l; break; case '>': if (m->flag & UNSIGNED) matched = v > l; else matched = (long) v > (long) l; break; case '<': if (m->flag & UNSIGNED) matched = v < l; else matched = (long) v < (long) l; break; case '&': matched = (v & l) == l; break; case '^': matched = (v & l) != l; break; default: matched = 0; error("%s: can't happen: invalid relation %d.", __FUNCTION__, m->reln); break; /* NOTREACHED */ } return matched; } /* hook for printf-type functions */ void KMimeMagic::resultBufPrintf(char *str,...) { va_list ap; char buf[MAXMIMESTRING]; /* assemble the string into the buffer */ va_start(ap, str); vsprintf(buf, str, ap); va_end(ap); /* add the buffer to the list */ resultBuf += QString(buf); } /* states for the state-machine algorithm in finishResult() */ typedef enum { rsl_leading_space, rsl_type, rsl_subtype, rsl_separator, rsl_encoding } rsl_states; /* process resultBuf and set the MIME info in magicResult */ int KMimeMagic::finishResult() { int cur_pos, /* current position within result */ type_pos, /* content type starting point: position */ type_len, /* content type length */ encoding_pos, /* content encoding starting point: position */ encoding_len; /* content encoding length */ int state; /* start searching for the type and encoding */ state = rsl_leading_space; type_pos = type_len = 0; encoding_pos = encoding_len = 0; /* loop through the characters in the result */ for (cur_pos = 0; cur_pos < (int)resultBuf.length(); cur_pos++) { if (isspace(resultBuf.at(cur_pos))) { /* process whitespace actions for each state */ if (state == rsl_leading_space) { /* eat whitespace in this state */ continue; } else if (state == rsl_type) { /* whitespace: type has no slash! */ return DECLINED; } else if (state == rsl_subtype) { /* whitespace: end of MIME type */ state++; continue; } else if (state == rsl_separator) { /* eat whitespace in this state */ continue; } else if (state == rsl_encoding) { /* whitespace: end of MIME encoding */ /* we're done */ break; } else { /* should not be possible */ /* abandon malfunctioning module */ error("%s: bad state %d (ws)", __FUNCTION__, state); return DECLINED; } /* NOTREACHED */ } else if (state == rsl_type && resultBuf.at(cur_pos) == '/') { /* copy the char and go to rsl_subtype state */ type_len++; state++; } else { /* process non-space actions for each state */ if (state == rsl_leading_space) { /* non-space: begin MIME type */ state++; type_pos = cur_pos; type_len = 1; continue; } else if (state == rsl_type || state == rsl_subtype) { /* non-space: adds to type */ type_len++; continue; } else if (state == rsl_separator) { /* non-space: begin MIME encoding */ state++; encoding_pos = cur_pos; encoding_len = 1; continue; } else if (state == rsl_encoding) { /* non-space: adds to encoding */ encoding_len++; continue; } else { /* should not be possible */ /* abandon malfunctioning module */ error("%s: bad state %d (ns)", __FUNCTION__, state); return DECLINED; } /* NOTREACHED */ } /* NOTREACHED */ } /* if we ended prior to state rsl_subtype, we had incomplete info */ if (state != rsl_subtype && state != rsl_separator && state != rsl_encoding) { /* defer to other modules */ return DECLINED; } /* save the info in the request record */ if (state == rsl_subtype || state == rsl_encoding || state == rsl_encoding || state == rsl_separator) { magicResult->setContent(resultBuf.mid(type_pos, type_len)); } if (state == rsl_encoding) magicResult->setEncoding(resultBuf.mid(encoding_pos, encoding_len)); /* detect memory allocation errors */ if (!magicResult->getContent() || (state == rsl_encoding && !magicResult->getEncoding())) { return -1; } /* success! */ return OK; } /* * magic_process - process input file fn. Opens the file and reads a * fixed-size buffer to begin processing the contents. */ void KMimeMagic::process(const char * fn) { int fd = 0; unsigned char buf[HOWMANY + 1]; /* one extra for terminating '\0' */ struct utimbuf utbuf; struct stat sb; int nbytes = 0; /* number of bytes read from a datafile */ /* * first try judging the file based on its filesystem status */ if (fsmagic(fn, &sb) != 0) { resultBuf += "\n"; return; } if ((fd = open(fn, O_RDONLY)) < 0) { /* We can't open it, but we were able to stat it. */ /* * if (sb.st_mode & 0002) addResult("writable, "); * if (sb.st_mode & 0111) addResult("executable, "); */ error("can't read `%s' (%s).", fn, strerror(errno)); resultBuf += MIME_BINARY_UNREADABLE; return; } /* * try looking at the first HOWMANY bytes */ if ((nbytes = read(fd, (char *) buf, HOWMANY)) == -1) { error("%s read failed (%s).", fn, strerror(errno)); resultBuf += MIME_BINARY_UNREADABLE; return; /* NOTREACHED */ } if (nbytes == 0) { resultBuf += MIME_BINARY_ZEROSIZE; } else { buf[nbytes++] = '\0'; /* null-terminate it */ tryit(buf, nbytes); } /* * Try to restore access, modification times if read it. */ utbuf.actime = sb.st_atime; utbuf.modtime = sb.st_mtime; (void) utime(fn, &utbuf); /* don't care if loses */ (void) close(fd); resultBuf += "\n"; } void KMimeMagic::tryit(unsigned char *buf, int nb) { /* * try tests in /etc/magic (or surrogate magic file) */ if (softmagic(buf, nb) == 1) return; /* * try known keywords, check for ascii-ness too. */ if (ascmagic(buf, nb) == 1) return; /* * abandon hope, all ye who remain here */ resultBuf += MIME_BINARY_UNKNOWN; accuracy = 0; } int KMimeMagic::fsmagic(const char *fn, struct stat *sb) { int ret = 0; /* * Fstat is cheaper but fails for files you don't have read perms on. * On 4.2BSD and similar systems, use lstat() to identify symlinks. */ ret = lstat(fn, sb); /* don't merge into if; see "ret =" above */ if (ret) { return 1; } /* * if (sb->st_mode & S_ISUID) resultBuf += "setuid "; * if (sb->st_mode & S_ISGID) resultBuf += "setgid "; * if (sb->st_mode & S_ISVTX) resultBuf += "sticky "; */ switch (sb->st_mode & S_IFMT) { case S_IFDIR: resultBuf += MIME_INODE_DIR; return 1; case S_IFCHR: resultBuf += MIME_INODE_CDEV; return 1; case S_IFBLK: resultBuf += MIME_INODE_BDEV; return 1; /* TODO add code to handle V7 MUX and Blit MUX files */ #ifdef S_IFIFO case S_IFIFO: resultBuf += MIME_INODE_FIFO;; return 1; #endif #ifdef S_IFLNK case S_IFLNK: { char buf[BUFSIZ + BUFSIZ + 4]; register int nch; struct stat tstatbuf; if ((nch = readlink(fn, buf, BUFSIZ - 1)) <= 0) { resultBuf += MIME_INODE_LINK; resultBuf += "\nunreadable"; return 1; } buf[nch] = '\0'; /* readlink(2) forgets this */ /* If broken symlink, say so and quit early. */ if (*buf == '/') { if (stat(buf, &tstatbuf) < 0) { resultBuf += MIME_INODE_LINK; resultBuf += "\nbroken"; return 1; } } else { char *tmp; char buf2[BUFSIZ + BUFSIZ + 4]; strcpy(buf2, fn); if ((tmp = strrchr(buf2, '/')) == NULL) { tmp = buf; /* in current dir */ } else { /* dir part plus (rel.) link */ *++tmp = '\0'; strcat(buf2, buf); tmp = buf2; } if (stat(tmp, &tstatbuf) < 0) { resultBuf += MIME_INODE_LINK; resultBuf += "\nbroken"; return 1; } else strcpy(buf, tmp); } if (followLinks) process(buf); else resultBuf += MIME_INODE_LINK; return 1; } return 1; #endif #ifdef S_IFSOCK #ifndef __COHERENT__ case S_IFSOCK: resultBuf += MIME_INODE_SOCK; return 1; #endif #endif case S_IFREG: break; default: error("%s: invalid mode 0%o.", __FUNCTION__, sb->st_mode); /* NOTREACHED */ } /* * regular file, check next possibility */ if (sb->st_size == 0) { resultBuf += MIME_BINARY_ZEROSIZE; return 1; } return 0; } /* * softmagic - lookup one file in database (already read from /etc/magic by * apprentice.c). Passed the name and FILE * of one file to be typed. */ int KMimeMagic::softmagic(unsigned char *buf, int nbytes) { if (match(buf, nbytes)) return 1; return 0; } /* * Go through the whole list, stopping if you find a match. Process all the * continuations of that match before returning. * * We support multi-level continuations: * * At any time when processing a successful top-level match, there is a current * continuation level; it represents the level of the last successfully * matched continuation. * * Continuations above that level are skipped as, if we see one, it means that * the continuation that controls them - i.e, the lower-level continuation * preceding them - failed to match. * * Continuations below that level are processed as, if we see one, it means * we've finished processing or skipping higher-level continuations under the * control of a successful or unsuccessful lower-level continuation, and are * now seeing the next lower-level continuation and should process it. The * current continuation level reverts to the level of the one we're seeing. * * Continuations at the current level are processed as, if we see one, there's * no lower-level continuation that may have failed. * * If a continuation matches, we bump the current continuation level so that * higher-level continuations are processed. */ int KMimeMagic::match(unsigned char *s, int nbytes) { #if (MIME_MAGIC_DEBUG > 1) int rule_counter = 0; #endif int cont_level = 0; int need_separator = 0; union VALUETYPE p; struct magic *m; #if (MIME_MAGIC_DEBUG > 1) debug("%s: conf=%p file=%s m=%s m->next=%s last=%s", __FUNCTION__, conf, conf->magicfile ? conf->magicfile : "NULL", conf->magic ? "set" : "NULL", (conf->magic && conf->magic->next) ? "set" : "NULL", conf->last ? "set" : "NULL"); for (m = conf->magic; m; m = m->next) { if (isprint((((unsigned long) m) >> 24) & 255) && isprint((((unsigned long) m) >> 16) & 255) && isprint((((unsigned long) m) >> 8) & 255) && isprint(((unsigned long) m) & 255)) { debug("%s: POINTER CLOBBERED! " "m=\"%c%c%c%c\"", __FUNCTION__, (((unsigned long) m) >> 24) & 255, (((unsigned long) m) >> 16) & 255, (((unsigned long) m) >> 8) & 255, ((unsigned long) m) & 255); break; } } #endif for (m = conf->magic; m; m = m->next) { #if (MIME_MAGIC_DEBUG > 1) rule_counter++; debug("%s: line=%d desc=%s", __FUNCTION__, m->lineno, m->desc); #endif /* check if main entry matches */ if (!mget(&p, s, m, nbytes) || !mcheck(&p, m)) { struct magic *m_cont; /* * main entry didn't match, flush its continuations */ if (!m->next || (m->next->cont_level == 0)) { continue; } m_cont = m->next; while (m_cont && (m_cont->cont_level != 0)) { #if (MIME_MAGIC_DEBUG > 1) rule_counter++; debug("%s: line=%d mc=%p mc->next=%p " "cont=%d desc=%s", __FUNCTION__, m_cont->lineno, m_cont, m_cont->next, m_cont->cont_level, m_cont->desc ? m_cont->desc : "NULL"); #endif /* * this trick allows us to keep *m in sync * when the continue advances the pointer */ m = m_cont; m_cont = m_cont->next; } continue; } /* if we get here, the main entry rule was a match */ /* this will be the last run through the loop */ #if (MIME_MAGIC_DEBUG > 1) debug("%s: rule matched, line=%d type=%d %s", __FUNCTION__, m->lineno, m->type, (m->type == STRING) ? m->value.s : ""); #endif /* print the match */ mprint(&p, m); /* * If we printed something, we'll need to print a blank * before we print something else. */ if (m->desc[0]) need_separator = 1; /* and any continuations that match */ cont_level++; /* * while (m && m->next && m->next->cont_level != 0 && ( m = * m->next )) */ m = m->next; while (m && (m->cont_level != 0)) { #if (MIME_MAGIC_DEBUG > 1) debug("%s: line=%d cont=%d type=%d %s", __FUNCTION__, m->lineno, m->cont_level, m->type, (m->type == STRING) ? m->value.s : ""); #endif if (cont_level >= m->cont_level) { if (cont_level > m->cont_level) { /* * We're at the end of the level * "cont_level" continuations. */ cont_level = m->cont_level; } if (mget(&p, s, m, nbytes) && mcheck(&p, m)) { /* * This continuation matched. Print * its message, with a blank before * it if the previous item printed * and this item isn't empty. */ /* space if previous printed */ if (need_separator && (m->nospflag == 0) && (m->desc[0] != '\0') ) { resultBuf += " "; need_separator = 0; } mprint(&p, m); if (m->desc[0]) need_separator = 1; /* * If we see any continuations at a * higher level, process them. */ cont_level++; } } /* move to next continuation record */ m = m->next; } #if (MIME_MAGIC_DEBUG > 1) debug("%s: matched after %d rules", __FUNCTION__, rule_counter); #endif return 1; /* all through */ } #if (MIME_MAGIC_DEBUG > 1) debug("%s: failed after %d rules", __FUNCTION__, rule_counter); #endif return 0; /* no match at all */ } void KMimeMagic::mprint(union VALUETYPE *p, struct magic *m) { char *pp, *rt; unsigned long v; switch (m->type) { case BYTE: v = p->b; break; case SHORT: case BESHORT: case LESHORT: v = p->h; break; case LONG: case BELONG: case LELONG: v = p->l; break; case STRING: if (m->reln == '=') { resultBufPrintf(m->desc, m->value.s); } else { resultBufPrintf(m->desc, p->s); } return; case DATE: case BEDATE: case LEDATE: pp = ctime((time_t *) & p->l); if ((rt = strchr(pp, '\n')) != NULL) *rt = '\0'; resultBufPrintf(m->desc, pp); return; default: error("%s: invalid m->type (%d)", __FUNCTION__, m->type); return; } v = signextend(m, v) & m->mask; resultBufPrintf(m->desc, (unsigned long) v); } /* an optimization over plain strcmp() */ #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) int KMimeMagic::ascmagic(unsigned char *buf, int nbytes) { int i; double pct, maxpct, pctsum; double pcts[NTYPES]; int mostaccurate, tokencount; int typeset, jonly, conly, jconly, cppcomm, ccomm; int has_escapes = 0; unsigned char *s; char nbuf[HOWMANY + 1]; /* one extra for terminating '\0' */ char *token; register struct names *p; int typecount[NTYPES]; /* these are easy, do them first */ accuracy = 70; /* * for troff, look for . + letter + letter or .\"; this must be done * to disambiguate tar archives' ./file and other trash from real * troff input. */ if (*buf == '.') { unsigned char *tp = buf + 1; while (isascii(*tp) && isspace(*tp)) ++tp; /* skip leading whitespace */ if ((isascii(*tp) && (isalnum(*tp) || *tp == '\\') && isascii(*(tp + 1)) && (isalnum(*(tp + 1)) || *tp == '"'))) { resultBuf += MIME_APPL_TROFF; return 1; } } if ((*buf == 'c' || *buf == 'C') && isascii(*(buf + 1)) && isspace(*(buf + 1))) { /* Fortran */ resultBuf += MIME_TEXT_FORTRAN; return 1; } /* look for tokens - this is expensive! */ /* make a copy of the buffer here because strtok() will destroy it */ s = (unsigned char *) memcpy(nbuf, buf, nbytes); s[nbytes] = '\0'; has_escapes = (memchr(s, '\033', nbytes) != NULL); /* * Fritz: * Try a little harder on C/C++/Java. */ memset(&typecount, 0, sizeof(typecount)); typeset = 0; jonly = 0; conly = 0; jconly = 0; cppcomm = 0; ccomm = 0; tokencount = 0; // first collect all possible types and count matches while ((token = strtok((char *) s, " \t\n\r\f,;")) != NULL) { s = NULL; /* make strtok() keep on tokin' */ for (p = names; p->name ; p++) { if (STREQ(p->name, token)) { tokencount++; typeset |= p->type; if (p->type == L_JAVA) jonly++; if ((p->type & (L_C|L_CPP|L_JAVA)) == (L_CPP|L_JAVA)) jconly++; if ((p->type & (L_C|L_CPP|L_JAVA)) == (L_C|L_CPP)) conly++; if (STREQ(token, "//")) cppcomm++; if (STREQ(token, "/*")) ccomm++; for (i = 0; i < (int)NTYPES; i++) if ((1 << i) & p->type) typecount[i]++; } } } if (typeset & (L_C|L_CPP|L_JAVA)) { accuracy = 40; if (!(typeset & ~(L_C|L_CPP|L_JAVA))) { if (jonly && conly) error("Oops, jonly && conly?!"); if (jonly) { // A java-only token has matched resultBuf += QString(types[P_JAVA].type); return 1; } if (jconly) { // A non-C (only C++ or Java) token has matched. if (typecount[P_JAVA] > typecount[P_CPP]) resultBuf += QString(types[P_JAVA].type); else resultBuf += QString(types[P_CPP].type); return 1; } if (conly) { // Either C or C++, rely on comments. if (cppcomm) resultBuf += QString(types[P_CPP].type); else resultBuf += QString(types[P_C].type); return 1; } if (ccomm) { resultBuf += QString(types[P_C].type); return 1; } } } /* Neither C, C++ or Java (or all of them without able to distinguish): * Simply take the token-class with the highest * matchcount > 0 */ mostaccurate = -1; maxpct = pctsum = 0.0; for (i = 0; i < (int)NTYPES; i++) { pct = (double)typecount[i] / (double)types[i].kwords * (double)types[i].weight; pcts[i] = pct; pctsum += pct; if (pct > maxpct) { maxpct = pct; mostaccurate = i; } #if MIME_MAGIC_DEBUG printf("%s has %d hits, %d kw, %f -> max = %f\n", types[i].type, typecount[i], types[i].kwords, pct, maxpct); #endif } if (mostaccurate >= 0.0) { accuracy = (int)(pcts[mostaccurate] / pctsum * 60); resultBuf += QString(types[mostaccurate].type); return 1; } switch (is_tar(buf, nbytes)) { case 1: /* V7 tar archive */ resultBuf += MIME_APPL_TAR; accuracy = 90; return 1; case 2: /* POSIX tar archive */ resultBuf += MIME_APPL_TAR; accuracy = 90; return 1; } for (i = 0; i < nbytes; i++) { if (!isascii(*(buf + i))) return 0; /* not all ascii */ } /* all else fails, but it is ascii... */ accuracy = 90; if (has_escapes) { /* text with escape sequences */ /* we leave this open for further differentiation later */ resultBuf += MIME_TEXT_UNKNOWN; } else { /* plain text */ resultBuf += MIME_TEXT_UNKNOWN; } return 1; } /* * is_tar() -- figure out whether file is a tar archive. * * Stolen (by author of file utility) from the public domain tar program: Public * Domain version written 26 Aug 1985 John Gilmore (ihnp4!hoptoad!gnu). * * @(#)list.c 1.18 9/23/86 Public Domain - gnu $Id: mod_mime_magic.c,v 1.7 * 1997/06/24 00:41:02 ikluft Exp ikluft $ * * Comments changed and some code/comments reformatted for file command by Ian * Darwin. */ #define isodigit(c) ( ((c) >= '0') && ((c) <= '7') ) /* * Return 0 if the checksum is bad (i.e., probably not a tar archive), 1 for * old UNIX tar file, 2 for Unix Std (POSIX) tar file. */ static int is_tar(unsigned char *buf, int nbytes) { register union record *header = (union record *) buf; register int i; register long sum, recsum; register char *p; if (nbytes < (int)sizeof(union record)) return 0; recsum = from_oct(8, header->header.chksum); sum = 0; p = header->charptr; for (i = sizeof(union record); --i >= 0;) { /* * We can't use unsigned char here because of old compilers, * e.g. V7. */ sum += 0xFF & *p++; } /* Adjust checksum to count the "chksum" field as blanks. */ for (i = sizeof(header->header.chksum); --i >= 0;) sum -= 0xFF & header->header.chksum[i]; sum += ' ' * sizeof header->header.chksum; if (sum != recsum) return 0; /* Not a tar archive */ if (0 == strcmp(header->header.magic, TMAGIC)) return 2; /* Unix Standard tar archive */ return 1; /* Old fashioned tar archive */ } /* * Quick and dirty octal conversion. * * Result is -1 if the field is invalid (all blank, or nonoctal). */ static long from_oct(int digs, char *where) { register long value; while (isspace(*where)) { /* Skip spaces */ where++; if (--digs <= 0) return -1; /* All blank field */ } value = 0; while (digs > 0 && isodigit(*where)) { /* Scan til nonoctal */ value = (value << 3) | (*where++ - '0'); --digs; } if (digs > 0 && *where && !isspace(*where)) return -1; /* Ended on non-space/nul */ return value; } /* * Check for file-revision suffix * * This is for an obscure document control system used on an intranet. * The web representation of each file's revision has an @1, @2, etc * appended with the revision number. This needs to be stripped off to * find the file suffix, which can be recognized by sending the name back * through a sub-request. The base file name (without the @num suffix) * must exist because its type will be used as the result. */ /* * Don't know if we really need this within KDE?! * ... well i've to look into original file code. */ const KMimeMagicResult * KMimeMagic::revision_suffix(const char * fn) { int suffix_pos; QString newfn = QString(fn); #if (MIME_MAGIC_DEBUG > 2) debug("%s: checking %s", __FUNCTION__, fn); #endif /* check for recognized revision suffix */ suffix_pos = newfn.findRev(QRegExp("@[0-9]*$")); #if (MIME_MAGIC_DEBUG > 2) debug("%s: suffix_pos=%d", __FUNCTION__, suffix_pos); #endif if (suffix_pos == -1) return NULL; return findFileType((const char *)newfn.left(suffix_pos)); } /* * The Constructor */ KMimeMagic::KMimeMagic(const char * _configfile) { int result; conf = (config_rec *)calloc(1, sizeof(config_rec)); /* set up the magic list (empty) */ conf->magic = conf->last = NULL; magicResult = NULL; followLinks = FALSE; if (_configfile) conf->magicfile = strdup(_configfile); /* on the first time through we read the magic file */ result = apprentice(); if (result == -1) return; #if (MIME_MAGIC_DEBUG > 1) test_table(); #endif } /* * The destructor. * Free the magic-table and other resources. */ KMimeMagic::~KMimeMagic() { if (conf) { struct magic *p = conf->magic; struct magic *q; while (p) { q = p; p = p->next; free(q); } free(conf); } if (magicResult) delete magicResult; } bool KMimeMagic::mergeConfig(const char * _configfile) { int result; if (conf) { char * old_magicfile = conf->magicfile; if (_configfile) conf->magicfile = strdup(_configfile); else return false; result = apprentice(); if (result == -1) { conf->magicfile = old_magicfile; return false; } #if (MIME_MAGIC_DEBUG > 1) test_table(); #endif return true; } return false; } bool KMimeMagic::mergeBufConfig(char * _configbuf) { int result; if (conf) { result = buff_apprentice(_configbuf); if (result == -1) return false; #if (MIME_MAGIC_DEBUG > 1) test_table(); #endif return true; } return false; } void KMimeMagic::setFollowLinks( bool _enable ) { followLinks = _enable; } const KMimeMagicResult * KMimeMagic::findBufferType(const char * buffer, int nbytes) { unsigned char buf[HOWMANY + 1]; /* one extra for terminating '\0' */ resultBuf.resize(0); if (magicResult) { magicResult->setContent(QString(0)); magicResult->setEncoding(QString(0)); } else magicResult = new KMimeMagicResult(); accuracy = 100; if (nbytes > HOWMANY) nbytes = HOWMANY; memcpy(buf, buffer, nbytes); if (nbytes == 0) { resultBuf += MIME_BINARY_ZEROSIZE; } else { buf[nbytes++] = '\0'; /* null-terminate it */ tryit(buf, nbytes); } resultBuf += "\n"; /* if we have any results, put them in the request structure */ finishResult(); magicResult->setAccuracy(accuracy); return magicResult; } static void refineResult(KMimeMagicResult *r, const char * _filename) { QString tmp = r->getContent(); if (tmp.isEmpty()) return; if ((strcmp(tmp, "text/x-c") == 0) || (strcmp(tmp, "text/x-c++") == 0) ) { if ( QString(_filename).right(2) == ".h" ) tmp += "hdr"; else tmp += "src"; r->setContent(tmp); } } const KMimeMagicResult * KMimeMagic::findBufferFileType( const char * buffer, int nbytes, const char * fn) { KMimeMagicResult * r = (KMimeMagicResult *)findBufferType( buffer, nbytes ); refineResult(r, fn); return r; } /* * Find the content-type of the given file. */ const KMimeMagicResult * KMimeMagic::findFileType(const char *fn) { resultBuf.resize(0); if (magicResult) { magicResult->setContent(QString(0)); magicResult->setEncoding(QString(0)); } else magicResult = new KMimeMagicResult(); accuracy = 100; /* try excluding file-revision suffixes */ if (!revision_suffix(fn)) { /* process it based on the file contents */ process(fn); } /* if we have any results, put them in the request structure */ finishResult(); magicResult->setAccuracy(accuracy); refineResult(magicResult, fn); return magicResult; }