allbery@uunet.UU.NET (Brandon S. Allbery - comp.sources.misc) (10/01/89)
Posting-number: Volume 8, Issue 67 Submitted-by: howard@dahlbeck.ericsson.se (Howard Gayle) Archive-name: cz/part03 #! /bin/sh # This is a shell archive. Remove anything before this line, then feed it # into a shell via "sh file" or similar. To overwrite existing files, # type "sh file -c". # The tool that generated this appeared in the comp.sources.unix newsgroup; # send mail to comp-sources-unix@uunet.uu.net if you want that tool. # If this archive is complete, you will see the following message at the end: # "End of archive 3 (of 14)." # Contents: 78dkus.c 78triFreq.c MakeCommon T-61.p4 b.bib bytefreq.1 # Wrapped by howard@dahlbeck on Mon Sep 25 07:15:13 1989 PATH=/bin:/usr/bin:/usr/ucb ; export PATH if test -f '78dkus.c' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'78dkus.c'\" else echo shar: Extracting \"'78dkus.c'\" \(30798 characters\) sed "s/^X//" >'78dkus.c' <<'END_OF_FILE' X/* X * 78dkus - convert Danish or (US) English from ISO 646 to ISO 8859/1 X */ X X#ifndef lint Xstatic char _cpyrgt[] = "Copyright 1989 Howard Lee Gayle"; X#endif lint X X/* X * This program is free software; you can redistribute it and/or modify X * it under the terms of the GNU General Public License version 1, X * as published by the Free Software Foundation. X * X * This program is distributed in the hope that it will be useful, X * but WITHOUT ANY WARRANTY; without even the implied warranty of X * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the X * GNU General Public License for more details. X * X * You should have received a copy of the GNU General Public License X * along with this program; if not, write to the Free Software X * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
X */ X X#include <stdio.h> X#include <howard/port.h> X#include <howard/version.h> X#include <howard/usage.h> X XMAINVER ("@(#)$Header: 78dkus.c,v 1.2 89/08/26 13:26:06 howard Exp $"); XUSAGE ("[-# shar-points] [-A attack] [-B blank-line-smoothing] [-D decay] [-b body-points] [-c colon-smoothing] [-d] [-f] [-m] [-s lines] [-t threshold]"); X X#include <ctype.h> X#include <limits.h> X#include <string.h> X#include <howard/a2.h> X#include <howard/malf.h> X#include <howard/registers.i> X#include "cz.h" X#include "78.h" X XPRIVATE byteT byte2t[256] = /* Map each byte to a trigram code.*/ X{ X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X25, X27, X28, X26, X32, X32, X32, X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X25, X27, X28, X26, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X26, X27, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X28, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X26, X27, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X32, X28, X32, X32, X32, X32, X32, X32, X32, X}; X XPRIVATE byteT dk8[256] = /* Map Danish ISO 646 to ISO 8859/1.*/ X{ X0000,/* 0/ 0 0 0 0 NUL (null) */ X0001,/* 0/ 1 1 1 1 SOH (start of 
heading) */ X0002,/* 0/ 2 2 2 2 STX (start of text) */ X0003,/* 0/ 3 3 3 3 ETX (end of text) */ X0004,/* 0/ 4 4 4 4 EOT (end of transmission) */ X0005,/* 0/ 5 5 5 5 ENQ (enquiry) */ X0006,/* 0/ 6 6 6 6 ACK (acknowledge) */ X0007,/* 0/ 7 7 7 7 BEL (bell) */ X0010,/* 0/ 8 10 8 8 BS (backspace) */ X0011,/* 0/ 9 11 9 9 HT (horizontal tabulation) */ X0012,/* 0/10 12 10 A LF (line feed) */ X0013,/* 0/11 13 11 B VT (vertical tabulation) */ X0014,/* 0/12 14 12 C FF (form feed) */ X0015,/* 0/13 15 13 D CR (carriage return) */ X0016,/* 0/14 16 14 E SO (shift out) */ X0017,/* 0/15 17 15 F SI (shift in) */ X0020,/* 1/ 0 20 16 10 DLE (data link escape) */ X0021,/* 1/ 1 21 17 11 DC1 (device control 1) */ X0022,/* 1/ 2 22 18 12 DC2 (device control 2) */ X0023,/* 1/ 3 23 19 13 DC3 (device control 3) */ X0024,/* 1/ 4 24 20 14 DC4 (device control 4) */ X0025,/* 1/ 5 25 21 15 NAK (negative aknowledge) */ X0026,/* 1/ 6 26 22 16 SYN (synchronous idle) */ X0027,/* 1/ 7 27 23 17 ETB (end of transmission block) */ X0030,/* 1/ 8 30 24 18 CAN (cancel) */ X0031,/* 1/ 9 31 25 19 EM (end of medium) */ X0032,/* 1/10 32 26 1A SUB (substitute character) */ X0033,/* 1/11 33 27 1B ESC (escape) */ X0034,/* 1/12 34 28 1C IS4/FS (information separator 4 / file separator)*/ X0035,/* 1/13 35 29 1D IS3/GS (information separator 3 / group separator) */ X0036,/* 1/14 36 30 1E IS2/RS (information separator 2 / record separator)*/ X0037,/* 1/15 37 31 1F IS1/US (information separator 1 / unit separator)*/ X0040,/* 2/ 0 40 32 20 space */ X0041,/* 2/ 1 41 33 21 exclamation mark */ X0042,/* 2/ 2 42 34 22 quotation mark */ X0043,/* 2/ 3 43 35 23 number sign */ X0044,/* 2/ 4 44 36 24 dollar sign */ X0045,/* 2/ 5 45 37 25 percent sign */ X0046,/* 2/ 6 46 38 26 ampersand */ X0047,/* 2/ 7 47 39 27 apostrophe */ X0050,/* 2/ 8 50 40 28 left parenthesis */ X0051,/* 2/ 9 51 41 29 right parenthesis */ X0052,/* 2/10 52 42 2A asterisk */ X0053,/* 2/11 53 43 2B plus sign */ X0054,/* 2/12 54 44 2C comma */ X0055,/* 2/13 55 45 
2D hyphen, minus sign */ X0056,/* 2/14 56 46 2E full stop */ X0057,/* 2/15 57 47 2F solidus */ X0060,/* 3/ 0 60 48 30 digit zero */ X0061,/* 3/ 1 61 49 31 digit one */ X0062,/* 3/ 2 62 50 32 digit two */ X0063,/* 3/ 3 63 51 33 digit three */ X0064,/* 3/ 4 64 52 34 digit four */ X0065,/* 3/ 5 65 53 35 digit five */ X0066,/* 3/ 6 66 54 36 digit six */ X0067,/* 3/ 7 67 55 37 digit seven */ X0070,/* 3/ 8 70 56 38 digit eight */ X0071,/* 3/ 9 71 57 39 digit nine */ X0072,/* 3/10 72 58 3A colon */ X0073,/* 3/11 73 59 3B semicolon */ X0074,/* 3/12 74 60 3C less-than sign */ X0075,/* 3/13 75 61 3D equals sign */ X0076,/* 3/14 76 62 3E greater-than sign */ X0077,/* 3/15 77 63 3F question mark */ X0100,/* 4/ 0 100 64 40 commercial at */ X0101,/* 4/ 1 101 65 41 A */ X0102,/* 4/ 2 102 66 42 B */ X0103,/* 4/ 3 103 67 43 C */ X0104,/* 4/ 4 104 68 44 D */ X0105,/* 4/ 5 105 69 45 E */ X0106,/* 4/ 6 106 70 46 F */ X0107,/* 4/ 7 107 71 47 G */ X0110,/* 4/ 8 110 72 48 H */ X0111,/* 4/ 9 111 73 49 I */ X0112,/* 4/10 112 74 4A J */ X0113,/* 4/11 113 75 4B K */ X0114,/* 4/12 114 76 4C L */ X0115,/* 4/13 115 77 4D M */ X0116,/* 4/14 116 78 4E N */ X0117,/* 4/15 117 79 4F O */ X0120,/* 5/ 0 120 80 50 P */ X0121,/* 5/ 1 121 81 51 Q */ X0122,/* 5/ 2 122 82 52 R */ X0123,/* 5/ 3 123 83 53 S */ X0124,/* 5/ 4 124 84 54 T */ X0125,/* 5/ 5 125 85 55 U */ X0126,/* 5/ 6 126 86 56 V */ X0127,/* 5/ 7 127 87 57 W */ X0130,/* 5/ 8 130 88 58 X */ X0131,/* 5/ 9 131 89 59 Y */ X0132,/* 5/10 132 90 5A Z */ X0306,/* 5/11 133 91 5B left square bracket */ X0330,/* 5/12 134 92 5C reverse solidus */ X0305,/* 5/13 135 93 5D right square bracket */ X0136,/* 5/14 136 94 5E circumflex accent */ X0137,/* 5/15 137 95 5F low line, underline */ X0140,/* 6/ 0 140 96 60 grave accent */ X0141,/* 6/ 1 141 97 61 a */ X0142,/* 6/ 2 142 98 62 b */ X0143,/* 6/ 3 143 99 63 c */ X0144,/* 6/ 4 144 100 64 d */ X0145,/* 6/ 5 145 101 65 e */ X0146,/* 6/ 6 146 102 66 f */ X0147,/* 6/ 7 147 103 67 g */ X0150,/* 6/ 8 150 104 68 h */ 
X0151,/* 6/ 9 151 105 69 i */ X0152,/* 6/10 152 106 6A j */ X0153,/* 6/11 153 107 6B k */ X0154,/* 6/12 154 108 6C l */ X0155,/* 6/13 155 109 6D m */ X0156,/* 6/14 156 110 6E n */ X0157,/* 6/15 157 111 6F o */ X0160,/* 7/ 0 160 112 70 p */ X0161,/* 7/ 1 161 113 71 q */ X0162,/* 7/ 2 162 114 72 r */ X0163,/* 7/ 3 163 115 73 s */ X0164,/* 7/ 4 164 116 74 t */ X0165,/* 7/ 5 165 117 75 u */ X0166,/* 7/ 6 166 118 76 v */ X0167,/* 7/ 7 167 119 77 w */ X0170,/* 7/ 8 170 120 78 x */ X0171,/* 7/ 9 171 121 79 y */ X0172,/* 7/10 172 122 7A z */ X0346,/* 7/11 173 123 7B left curly bracket */ X0370,/* 7/12 174 124 7C vertical line */ X0345,/* 7/13 175 125 7D right curly bracket */ X0176,/* 7/14 176 126 7E tilde */ X0177,/* 7/15 177 127 7F DEL (delete) */ X0200,/* 8/ 0 200 128 80 */ X0201,/* 8/ 1 201 129 81 */ X0202,/* 8/ 2 202 130 82 */ X0203,/* 8/ 3 203 131 83 */ X0204,/* 8/ 4 204 132 84 */ X0205,/* 8/ 5 205 133 85 */ X0206,/* 8/ 6 206 134 86 */ X0207,/* 8/ 7 207 135 87 */ X0210,/* 8/ 8 210 136 88 */ X0211,/* 8/ 9 211 137 89 */ X0212,/* 8/10 212 138 8A */ X0213,/* 8/11 213 139 8B */ X0214,/* 8/12 214 140 8C */ X0215,/* 8/13 215 141 8D */ X0216,/* 8/14 216 142 8E */ X0217,/* 8/15 217 143 8F */ X0220,/* 9/ 0 220 144 90 */ X0221,/* 9/ 1 221 145 91 */ X0222,/* 9/ 2 222 146 92 */ X0223,/* 9/ 3 223 147 93 */ X0224,/* 9/ 4 224 148 94 */ X0225,/* 9/ 5 225 149 95 */ X0226,/* 9/ 6 226 150 96 */ X0227,/* 9/ 7 227 151 97 */ X0230,/* 9/ 8 230 152 98 */ X0231,/* 9/ 9 231 153 99 */ X0232,/* 9/10 232 154 9A */ X0233,/* 9/11 233 155 9B */ X0234,/* 9/12 234 156 9C */ X0235,/* 9/13 235 157 9D */ X0236,/* 9/14 236 158 9E */ X0237,/* 9/15 237 159 9F */ X0240,/*10/ 0 240 160 A0 NBSP (no-break space) */ X0241,/*10/ 1 241 161 A1 inverted exclamation mark */ X0242,/*10/ 2 242 162 A2 cent sign */ X0243,/*10/ 3 243 163 A3 pound sign */ X0244,/*10/ 4 244 164 A4 general currency sign */ X0245,/*10/ 5 245 165 A5 yen sign */ X0246,/*10/ 6 246 166 A6 broken vertical line */ X0247,/*10/ 7 247 167 A7 section 
sign */ X0250,/*10/ 8 250 168 A8 diaeresis */ X0251,/*10/ 9 251 169 A9 copyright sign */ X0252,/*10/10 252 170 AA ordinal indicator, feminine */ X0253,/*10/11 253 171 AB angle quotation mark left */ X0254,/*10/12 254 172 AC not sign */ X0255,/*10/13 255 173 AD soft hyphen */ X0256,/*10/14 256 174 AE registered sign */ X0257,/*10/15 257 175 AF macron */ X0260,/*11/ 0 260 176 B0 degree sign */ X0261,/*11/ 1 261 177 B1 plus or minus sign */ X0262,/*11/ 2 262 178 B2 superscript two */ X0263,/*11/ 3 263 179 B3 superscript three */ X0264,/*11/ 4 264 180 B4 acute accent */ X0265,/*11/ 5 265 181 B5 micro sign */ X0266,/*11/ 6 266 182 B6 pilcrow */ X0267,/*11/ 7 267 183 B7 middle dot */ X0270,/*11/ 8 270 184 B8 cedilla */ X0271,/*11/ 9 271 185 B9 superscript one */ X0272,/*11/10 272 186 BA ordinal indicator, masculine */ X0273,/*11/11 273 187 BB angle quotation mark right */ X0274,/*11/12 274 188 BC fraction one-quarter */ X0275,/*11/13 275 189 BD fraction one-half */ X0276,/*11/14 276 190 BE fraction three-quarters */ X0277,/*11/15 277 191 BF inverted question mark */ X0300,/*12/ 0 300 192 C0 capital A with grave accent */ X0301,/*12/ 1 301 193 C1 capital A with acute accent */ X0302,/*12/ 2 302 194 C2 capital A with circumflex accent */ X0303,/*12/ 3 303 195 C3 capital A with tilde */ X0304,/*12/ 4 304 196 C4 capital A with diaeresis or umlaut mark */ X0305,/*12/ 5 305 197 C5 capital A with ring */ X0306,/*12/ 6 306 198 C6 capital AE diphthong */ X0307,/*12/ 7 307 199 C7 capital C with cedilla */ X0310,/*12/ 8 310 200 C8 capital E with grave accent */ X0311,/*12/ 9 311 201 C9 capital E with acute accent */ X0312,/*12/10 312 202 CA capital E with circumflex accent */ X0313,/*12/11 313 203 CB capital E with diaeresis or umlaut mark */ X0314,/*12/12 314 204 CC capital I with grave accent */ X0315,/*12/13 315 205 CD capital I with acute accent */ X0316,/*12/14 316 206 CE capital I with circumflex accent */ X0317,/*12/15 317 207 CF capital I with diaeresis or umlaut mark */ 
X0320,/*13/ 0 320 208 D0 capital D with stroke, Icelandic eth */ X0321,/*13/ 1 321 209 D1 capital N with tilde */ X0322,/*13/ 2 322 210 D2 capital O with grave accent */ X0323,/*13/ 3 323 211 D3 capital O with acute accent */ X0324,/*13/ 4 324 212 D4 capital O with circumflex accent */ X0325,/*13/ 5 325 213 D5 capital O with tilde */ X0326,/*13/ 6 326 214 D6 capital O with diaeresis or umlaut mark */ X0327,/*13/ 7 327 215 D7 multiplication sign */ X0330,/*13/ 8 330 216 D8 capital O with slash */ X0331,/*13/ 9 331 217 D9 capital U with grave accent */ X0332,/*13/10 332 218 DA capital U with acute accent */ X0333,/*13/11 333 219 DB capital U with circumflex accent */ X0334,/*13/12 334 220 DC capital U with diaeresis or umlaut mark */ X0335,/*13/13 335 221 DD capital Y with acute accent */ X0336,/*13/14 336 222 DE capital thorn, Icelandic */ X0337,/*13/15 337 223 DF small sharp s, German */ X0340,/*14/ 0 340 224 E0 small a with grave accent */ X0341,/*14/ 1 341 225 E1 small a with acute accent */ X0342,/*14/ 2 342 226 E2 small a with circumflex accent */ X0343,/*14/ 3 343 227 E3 small a with tilde */ X0344,/*14/ 4 344 228 E4 small a with diaeresis or umlaut mark */ X0345,/*14/ 5 345 229 E5 small a with ring */ X0346,/*14/ 6 346 230 E6 small ae diphthong */ X0347,/*14/ 7 347 231 E7 small c with cedilla */ X0350,/*14/ 8 350 232 E8 small e with grave accent */ X0351,/*14/ 9 351 233 E9 small e with acute accent */ X0352,/*14/10 352 234 EA small e with circumflex accent */ X0353,/*14/11 353 235 EB small e with diaeresis or umlaut mark */ X0354,/*14/12 354 236 EC small i with grave accent */ X0355,/*14/13 355 237 ED small i with acute accent */ X0356,/*14/14 356 238 EE small i with circumflex accent */ X0357,/*14/15 357 239 EF small i with diaeresis or umlaut mark */ X0360,/*15/ 0 360 240 F0 small d with stroke, Icelandic eth */ X0361,/*15/ 1 361 241 F1 small n with tilde */ X0362,/*15/ 2 362 242 F2 small o with grave accent */ X0363,/*15/ 3 363 243 F3 small o with acute 
accent */ X0364,/*15/ 4 364 244 F4 small o with circumflex accent */ X0365,/*15/ 5 365 245 F5 small o with tilde */ X0366,/*15/ 6 366 246 F6 small o with diaeresis or umlaut mark */ X0367,/*15/ 7 367 247 F7 division sign */ X0370,/*15/ 8 370 248 F8 small o with slash */ X0371,/*15/ 9 371 249 F9 small u with grave accent */ X0372,/*15/10 372 250 FA small u with acute accent */ X0373,/*15/11 373 251 FB small u with circumflex accent */ X0374,/*15/12 374 252 FC small u with diaeresis or umlaut mark */ X0375,/*15/13 375 253 FD small y with acute accent */ X0376,/*15/14 376 254 FE small thorn, Icelandic */ X0377,/*15/15 377 255 FF small y with diaeresis or umlaut mark */ X}; X X#include "78common.h" X X/* Different sections in a file: */ X#define S_HDR 1 /* News article header.*/ X#define S_BODY 2 /* News article body.*/ X#define S_SIG 3 /* News article signature.*/ X XPRIVATE double attack = 0.65; /* Smoothing factor.*/ XPRIVATE double blank = 0.7; /* Scale attack/decay on blank lines.*/ XPRIVATE double bodval = -200.0; /* Score at start of body.*/ XPRIVATE double colon = 0.5; /* Scale attack/decay after colon.*/ XPRIVATE boolT debug = FALSE; /* Debug flag.*/ XPRIVATE double decay = 0.67; /* Smoothing factor.*/ XPRIVATE boolT fixbody = FALSE; /* Ordinary file, no header or signature.*/ XPRIVATE double headval = 0.0; /* For header values.*/ XPRIVATE boolT mailbox = FALSE; /* Converting a mailbox.*/ XPRIVATE double pound1 = -350.0; /* After # at beginning of line.*/ XPRIVATE unsigned siglns = 9; /* Max lines in a signature.*/ XPRIVATE double thresh = 0.0; /* Score above this is Danish.*/ XPRIVATE triDifT dkustt[TRIMAX + 1];/* Trigram difference table.*/ X XPRIVATE bStrT dkwords[] = /* These are always Danish.*/ X { X NULBSTR X }; X XPRIVATE bStrT uswords[] = /* These are never Danish.*/ X { X S("[]"), X S("[The"), X NULBSTR X }; X X#include "78heur.h" X X/* dkus - run heuristics on one file */ X XPRIVATE void dkus (is, fn) XR9 streamT is; /* Input stream.*/ X bStrT fn; 
/* File name.*/ X X/* Function: X * Copy file to standard output, converting to ISO 8859/1. X * Algorithm: X * Read each line. Switch on section and look for section X * transitions. Step through the line. Look for section matches. X * Call dif78() on each word. Compute score. If word looks Danish, X * convert it. Write line. X * Returns: X * X * Notes: X * X */ X{ XR2 rcharT b; /* Current input byte.*/ XR4 int i; /* General putpose.*/ X double cum = 0.0; /* Cumulative score.*/ X unsigned ln = 0; /* Input line number.*/ X int lns = -1; /* Value from Lines: header field; -1 = unknown.*/ XR8 unsigned sigln = 1; /* Line number in signature.*/ XR5 bStrT p1; /* Rest of line after special match.*/ XR1 bStrT lp; /* Steps through lb[].*/ XR7 boolT sigbeg; /* Line looks like start of signature.*/ XR3 bStrT wp = NULBSTR; /* Points to start of word.*/ XR6 unsigned sect; /* Current section.*/ X byteT lb[MLINE + 1]; /* Line buffer.*/ X Xsect = (fixbody ? S_BODY : S_HDR); Xlb[0] = ' '; Xwhile (NULBSTR != (getlin ((lp = &lb[1]), MLINE, is, fn, &ln, 0))) X { X if (mailbox && (NULBSTR != prefix (S("From "), lp))) X { X cum = 0.0; X ln = 1; X lns = -1; X sect = S_HDR; X sigln = 1; X sigbeg = FALSE; X } X else X sigbeg = SigBegP (lp); X switch (sect) X { X case S_HDR: X if (EOS == B(*lp)) X { X sect = S_BODY; X cum = bodval; X ln = 0; X } X else X { X if (NULBSTR != (p1 = prefix (S("Lines: "), lp))) X (void) a2i (p1, NULBSTR, TRUE, &lns, (bStrT *) NULL); X if (NULBSTR != (p1 = bStrChr (lp, ':'))) X { X cum = headval; X lp = p1 + 1; X } X } X break; X case S_BODY: X if (sigbeg || (!fixbody && (lns > siglns) && (ln > (lns - siglns)))) X sect = S_SIG; X else X { X for (; '>' == B(*lp); ++lp) X ; X if (('#' == B(*lp)) || ('X' == B(*lp))) X cum = MIN (cum, pound1); X } X break; X case S_SIG: X if (sigbeg) X sigln = 1; X else if (sigln <= siglns) X ++sigln; X else X { X sigln = 1; X sect = S_BODY; X } X break; X default: X malf1 (eIntern, "dkus 1"); X break; X } X if (EOS == B(*lp)) cum *= 
blank * ((cum > thresh) ? decay : attack); X do X { X b = B(*lp); X if ((NULBSTR != (p1 = BraceP (lp, sect))) || X (NULBSTR != (p1 = UunetP (lp))) || X (NULBSTR != (p1 = IPP (lp))) || X (NULBSTR != (p1 = InArtP (lp, sect))) || X (NULBSTR != (p1 = GrafP (lp, sect))) || X (NULBSTR != (p1 = PipeP (lp, sect))) || X (NULBSTR != (p1 = EndP (lp, sect, S(") writes:")))) || X (NULBSTR != (p1 = EndP (lp, sect, S(" \\n\\")))) || X (NULBSTR != (p1 = LaTeXP (lp)))) X { X lp = p1; X wp = NULBSTR; X } X else X { X if (byte2t[b] <= TRIHI) X { X if (NULBSTR == wp) wp = lp; X } X else X { X if (NULBSTR != wp) X { X i = dif78 (wp, lp, dkustt); X cum *= ((i > 0) ? attack : decay); X cum += i; X if (((cum > thresh) && !wordp (wp, lp, uswords)) || X wordp (wp, lp, dkwords)) X { X for (p1 = wp; p1 != lp; ++p1) X *p1 = dk8[B(*p1)]; X } X if (debug) X FPRINTF (stderr, "%c%6.0f %.*s\n", "?hbsf"[sect], cum, X lp - wp, wp); X wp = NULBSTR; X if (':' == b) X cum *= colon * ((cum > thresh) ? decay : attack); X } X } X ++lp; X } X } X while (EOS != b); X puts (&lb[1]); X } X} X X/* main - main function */ X XPUBLIC int main (argc, argv) X int argc; /* Number of arguments.*/ XR3 bStrT *argv; /* Points to array of argument strings.*/ X X/* Function: X * X * Algorithm: X * Decode args. Initialize. Call dkus(). 
X * Notes: X * X */ X X{ XR1 rcharT c; /* Option letter.*/ XR2 bStrT cp; /* Steps through args.*/ Xextern int optind; /* See getopt (3).*/ Xextern cStrT optarg; /* See getopt (3).*/ X Xwhile (EOF != (c = getopt (argc, (cStrT *) argv, "#:A:B:D:b:c:dfh:ms:t:"))) X { X switch (c) X { X case '?': X usage(); X break; X case '#': X pound1 = ma2d ((bStrT) optarg, NULBSTR, FALSE, "# Value", X (bStrT *) NULL); X break; X case 'A': X attack = mra2d ((bStrT) optarg, NULBSTR, FALSE, "Attack", 0.001, 0.999, X (bStrT *) NULL); X break; X case 'B': X blank = ma2d ((bStrT) optarg, NULBSTR, FALSE, S("Blank smoothing"), X (bStrT *) NULL); X break; X case 'D': X decay = mra2d ((bStrT) optarg, NULBSTR, FALSE, "Decay", 0.001, 0.999, X (bStrT *) NULL); X break; X case 'b': X bodval = ma2d ((bStrT) optarg, NULBSTR, FALSE, "Body Value", X (bStrT *) NULL); X break; X case 'c': X colon = ma2d ((bStrT) optarg, NULBSTR, FALSE, S("Colon Smoothing"), X (bStrT *) NULL); X break; X case 'd': X debug = TRUE; X break; X case 'f': X fixbody = TRUE; X break; X case 'h': X headval = ma2d ((bStrT) optarg, NULBSTR, FALSE, S("Header Value"), X (bStrT *) NULL); X break; X case 'm': X mailbox = TRUE; X break; X case 's': X siglns = mra2u ((bStrT) optarg, NULBSTR, FALSE, "Max signature lines", X (unsigned) 1, (unsigned) 99, (bStrT *) NULL); X break; X case 't': X thresh = ma2d ((bStrT) optarg, NULBSTR, FALSE, "Threshold", X (bStrT *) NULL); X break; X default: X malf1 (eIntern, "main 1"); X break; X } X } Xargv += optind; Xcp = *argv++; Xif (NULBSTR != cp) usage(); Xipath(); Xmrdtri (S("dkus"), (bStrT) dkustt); Xdkus (stdin, S("Standard Input")); Xmfflush (stdout, "Standard Output"); Xexit (SUCCESS); X X#ifdef lint Xreturn (SUCCESS); X#endif X} END_OF_FILE if test 30798 -ne `wc -c <'78dkus.c'`; then echo shar: \"'78dkus.c'\" unpacked with wrong size! 
fi # end of '78dkus.c' fi if test -f '78triFreq.c' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'78triFreq.c'\" else echo shar: Extracting \"'78triFreq.c'\" \(4268 characters\) sed "s/^X//" >'78triFreq.c' <<'END_OF_FILE' X/* X * 78triFreq - compute trigram frequencies X */ X X#ifndef lint Xstatic char _cpyrgt[] = "Copyright 1989 Howard Lee Gayle"; X#endif lint X X/* X * This program is free software; you can redistribute it and/or modify X * it under the terms of the GNU General Public License version 1, X * as published by the Free Software Foundation. X * X * This program is distributed in the hope that it will be useful, X * but WITHOUT ANY WARRANTY; without even the implied warranty of X * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the X * GNU General Public License for more details. X * X * You should have received a copy of the GNU General Public License X * along with this program; if not, write to the Free Software X * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. X */ X X#include <stdio.h> X#include <howard/port.h> X#include <howard/version.h> X#include <howard/usage.h> X XMAINVER ("@(#)$Header: 78triFreq.c,v 2.2 89/08/25 11:15:35 howard Exp $"); XUSAGE ("encodings < filenames > trigram-frequencies"); X X#include <string.h> X#include <howard/malf.h> X#include <howard/registers.i> X#include "cz.h" X#include "78.h" X#include "78code.h" X XPRIVATE double tt; /* Total number of trigrams.*/ XPRIVATE double tricnt[TRIMAX + 1]; /* Trigram counts.*/ X X/* doword - process one word */ X XPRIVATE void doword (wp) XR1 bStrT wp; /* Points to word.*/ X X/* Function: X * Increment the count for each trigram in the word. X * Algorithm: X * Step through the word. X * Returns: X * X * Notes: X * 1) Word must already be converted to trigram codes. X * 2) wp[-1] == TRIBEG. X * 3) Word is terminated with TRIEND. 
X */ X{ Xdo X { X tricnt[TOTRI (wp[0], wp[1], wp[2])] += 1.0; X tt += 1.0; X ++wp; X } Xwhile (TRIEND != wp[1]); X} X X/* main - main function */ X XPUBLIC int main (argc, argv) X int argc; /* Number of arguments.*/ XR4 bStrT *argv; /* Points to array of argument strings.*/ X X/* Function: X * X * Algorithm: X * Loop reading each line from stdin. Treat it as a path name and X * open it. Call doword() on each word in file. X * Compute scaling factor (s) for an implied denominator of 2^31 - 1. X * Compute max frequency (m) and number of digits to print it (d). X * Loop through trigram count vector and output each frequency, along X * with the trigram in printable form. X * Notes: X * 1) Every 1000 input files, write the file number and total trigrams X * counted so far. This shows progress is being made, since X * running it on a lot of files can take hours. X */ X X{ XR5 int d; /* Max digits in count.*/ XR1 int i; /* General purpose.*/ XR3 streamT is; /* Input stream.*/ X unsigned ln = 0; /* Input line number.*/ X double m = 0; /* Max frequency.*/ X double s; /* Scaling factor.*/ XR2 bStrT wp; /* Steps through wb[].*/ X byteT fnb[MFILE]; /* File name buffer.*/ X byteT wb[MLINE]; /* Word buffer.*/ X Xif (2 != argc) usage(); Xrdcode (argv[1]); Xwb[0] = TRIBEG; Xwp = &wb[1]; Xwhile (NULBSTR != getlin (fnb, MFILE, stdin, S("Standard input"), &ln, 0)) X { X is = fopen (fnb, "r"); X if (NULSTRM == is) X malf0 ("%s: Can not open", fnb); X else X { X while (EOF != (i = getc (is))) X { X i = byte2t[i]; X if (i <= TRIHI) X *wp++ = i; X else X { X if (wp != &wb[1]) X { X *wp = TRIEND; X doword (wb); X wp = &wb[1]; X } X } X } X if (ferror (is)) malf1 ("%s: Read error", fnb); X mfclose (is, fnb); X } X if (0 == (ln % 1000)) X { X FPRINTF (stderr, "%u %.0f\n", ln, tt); X mfflush (stderr, S("Standard Error")); X } X } Xif (0.0 != tt) X { X PRINTF ("; Total trigrams: %.0f\n\n", tt); X s = 2147483647.0 / tt; /* 2^31 - 1.*/ X for (i = 0; i <= TRIMAX; ++i) X if (tricnt[i] > m) m = 
tricnt[i]; X SPRINTF ((cStrT) wb, "%.0f", m * s); X d = strlen ((cStrT) wb); X for (i = 0; i <= TRIMAX; ++i) X { X if (0.0 != tricnt[i]) X PRINTF ("%*.0f %c%c%c\n", d, tricnt[i] * s, X t2byte[31 & (i >> 5)], t2byte[i >> 10], t2byte[31 & i]); X } X mfflush (stdout, S("Standard Output")); X } Xexit (SUCCESS); X X#ifdef lint Xreturn (SUCCESS); X#endif X} END_OF_FILE if test 4268 -ne `wc -c <'78triFreq.c'`; then echo shar: \"'78triFreq.c'\" unpacked with wrong size! fi # end of '78triFreq.c' fi if test -f 'MakeCommon' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'MakeCommon'\" else echo shar: Extracting \"'MakeCommon'\" \(1187 characters\) sed "s/^X//" >'MakeCommon' <<'END_OF_FILE' X# MakeCommon - common definitions for uMakefile and Makefile for cz X# X# $Header$ X# X# Copyright 1989 Howard Lee Gayle X# This file is written in the ISO 8859/1 character set. X# X# This program is free software; you can redistribute it and/or modify X# it under the terms of the GNU General Public License version 1, X# as published by the Free Software Foundation. X# X# This program is distributed in the hope that it will be useful, X# but WITHOUT ANY WARRANTY; without even the implied warranty of X# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the X# GNU General Public License for more details. X# X# You should have received a copy of the GNU General Public License X# along with this program; if not, write to the Free Software X# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. X X# Copy stdin to stdout. XCAT=cat X X# Change mode of a file. XCHMOD=chmod X X# Get a file from the distribution directory. XDISTI=mkDistI X X# Macro processor. XM4=m4 X X# Remove a file. XRM=zap -f X X# Stream editor. XSED=sed X X# Uncompress if necessary and check out from RCS or SCCS. XUNCMPRS=mkUncmprs X X# C include file search path. 
XINCLUDES=-I/usr/local/local-include -I/usr/local/free/howard/0/include END_OF_FILE if test 1187 -ne `wc -c <'MakeCommon'`; then echo shar: \"'MakeCommon'\" unpacked with wrong size! fi # end of 'MakeCommon' fi if test -f 'T-61.p4' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'T-61.p4'\" else echo shar: Extracting \"'T-61.p4'\" \(2817 characters\) sed "s/^X//" >'T-61.p4' <<'END_OF_FILE' X% T.61.p4 - PostScript for cz0 for CCITT T.61. X% X% $Header: T-61.p4,v 1.1 89/08/04 16:41:23 howard Exp $ X% X% Copyright 1989 Howard Lee Gayle X% This file is written in the ISO 8859/1 character set. X% X% This program is free software; you can redistribute it and/or modify X% it under the terms of the GNU General Public License version 1, X% as published by the Free Software Foundation. X% X% This program is distributed in the hope that it will be useful, X% but WITHOUT ANY WARRANTY; without even the implied warranty of X% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the X% GNU General Public License for more details. X% X% You should have received a copy of the GNU General Public License X% along with this program; if not, write to the Free Software X% Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. X Xinclude(ps-abbrev.m4) Xinclude(T-61.m4) X X/CharHeight % (character) CharHeight height X {% def X gsave X newpath X 0 0 moveto X true charpath flattenpath pathbbox X grestore X 4 1 roll PPOP PPOP PPOP X }PBIND PDEF X X/@ % (accent) (character) @ X {% def X /AC exch PDEF % Accented character. X /DM exch PDEF % Diacritical mark. X currentpoint PPOP % Push current x. X AC PSHOW % Show character to be accented. X gsave % Come back here after printing accent. X currentpoint PPOP % Push current x. X sub % Negative width of character. X DM stringwidth PPOP % Push width of accent. X sub % Negative width of character plus accent. X 2 div % Half. X AC CharHeight X (x)CharHeight X sub % Subtract x height. X rmoveto % Move to left and up. 
X DM PSHOW % Add accent. X grestore X }PBIND PDEF X X/@Y % (accent) (character) delta-y @Y X {% def X /DY exch PDEF % Delta Y. X /AC exch PDEF % Accented character. X /DM exch PDEF % Diacritical mark. X currentpoint PPOP % Push current x. X AC PSHOW % Show character to be accented. X gsave % Come back here after printing accent. X currentpoint PPOP % Push current x. X sub % Negative width of character. X DM stringwidth PPOP % Push width of accent. X sub % Negative width of character plus accent. X 2 div % Half. X DY Size mul X rmoveto % Move to left and up. X DM PSHOW % Add accent. X grestore X }PBIND PDEF X X/@0{0 @Y}PBIND PDEF X X/Kern % (second-char) distance (first-char) Kern X {% def X PSHOW X Size mul neg 0 rmoveto X PSHOW X }PBIND def X X/@O % (accent) DeltaY DeltaX (character) Overlap X {% def X currentpoint PPOP % (accent) DeltaY DeltaX (character) current-x X exch PSHOW % (accent) DeltaY DeltaX current-x X gsave % Come back here after printing accent. X currentpoint PPOP % (accent) DeltaY DeltaX old-x current-x X sub % (accent) DeltaY DeltaX -width X exch Size mul add % (accent) DeltaY dx X exch Size mul % (accent) dx dy X rmoveto % (accent) X PSHOW X grestore X }PBIND PDEF END_OF_FILE if test 2817 -ne `wc -c <'T-61.p4'`; then echo shar: \"'T-61.p4'\" unpacked with wrong size! fi # end of 'T-61.p4' fi if test -f 'b.bib' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'b.bib'\" else echo shar: Extracting \"'b.bib'\" \(1926 characters\) sed "s/^X//" >'b.bib' <<'END_OF_FILE' X% b.bib - bibliography X% X% Copyright 1989 Howard Lee Gayle X% This file is written in the ISO 8859/1 character set. X% X% $Header: b.bib,v 1.5 89/09/22 07:44:48 howard Exp $ X% X% This program is free software; you can redistribute it and/or modify X% it under the terms of the GNU General Public License version 1, X% as published by the Free Software Foundation. 
X% X% This program is distributed in the hope that it will be useful, X% but WITHOUT ANY WARRANTY; without even the implied warranty of X% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the X% GNU General Public License for more details. X% X% You should have received a copy of the GNU General Public License X% along with this program; if not, write to the Free Software X% Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. X@book{Auden, X author = "W. H. Auden", X title = "Selected poetry of W. H. Auden", X publisher = "Vintage Books", X year = "1971", X address = "New York", X edition = "second"} X@misc{Gunnel, X author = {Gunnel K{\"a}llgren}, X howpublished = "Personal communication", X month = "August", X year = "1989"} X@manual{ISO:646, X title = "International Standard ISO 646, X Information processing---ISO 7-bit X coded character set for information interchange", X organization = "International Organization for Standardization", X edition = "second", X month = "July", X year = "1983"} X@manual{SWASCII, X title = "Svensk standard SS~63~61~27, X Data representation---Swedish 7-bit X coded character set for data interchange", X organization = "Standardiseringskommissionen i Sverige", X edition = "second", X month = "April", X year = "1984"} X@manual{Xerox, X title = "Xerox System Integration Standard XSIS~058404, X Character Code Standard", X organization = "Xerox Corporation", X address = "Stamford, Connecticut 10904, USA", X month = "April", X year = "1984"} END_OF_FILE if test 1926 -ne `wc -c <'b.bib'`; then echo shar: \"'b.bib'\" unpacked with wrong size!
fi # end of 'b.bib' fi if test -f 'bytefreq.1' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'bytefreq.1'\" else echo shar: Extracting \"'bytefreq.1'\" \(1607 characters\) sed "s/^X//" >'bytefreq.1' <<'END_OF_FILE' X.\" $Header: bytefreq.1,v 1.1 89/08/09 16:22:25 howard Exp $ X.TH BYTEFREQ 1 "$Revision: 1.1 $" X.SH NAME Xbytefreq \- frequency-count the bytes in a list of files X.SH SYNOPSIS X.B bytefreq X.RB [ " \-b " ] X.RB [ " \-d " ] X< X.I filenames X.SH COPYRIGHT XCopyright \(co 1989 Howard Lee Gayle X.SH DESCRIPTION X.I Bytefreq Xreads from standard input a list of file names, one per line. XIt generates a frequency table of the bytes in the files. XThe X.B \-b Xoption outputs the table in byte order. XThe X.B \-d Xoption outputs the table in descending frequency order. XBoth options may be used. X.SH EXAMPLE XGenerate a frequency table, in frequency order, of the bytes in Xall comp news articles: X.nf X % find /usr/spool/news/comp \-type f \-name '[0\-9]*' \-print | bytefreq \-d X.fi X.SH "SEE ALSO" X.IR 78 (1). X.SH LICENSE XThis program is free software; you can redistribute it and/or modify Xit under the terms of the GNU General Public License version 1, Xas published by the Free Software Foundation. X.PP XThis program is distributed in the hope that it will be useful, Xbut WITHOUT ANY WARRANTY; without even the implied warranty of XMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the XGNU General Public License for more details. X.PP XYou should have received a copy of the GNU General Public License Xalong with this program; if not, write to the Free Software XFoundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
X.SH AUTHOR XHoward Gayle, XTN/ETX/T/BG, XEricsson Telecom AB, XS-126 25 Stockholm, XSweden, Xhoward@ericsson.se, Xuunet!ericsson.se!howard, XPhone: +46 8 719 5565, XFAX: +46 8 719 9598, XTelex: 14910 ERIC S END_OF_FILE if test 1607 -ne `wc -c <'bytefreq.1'`; then echo shar: \"'bytefreq.1'\" unpacked with wrong size! fi # end of 'bytefreq.1' fi echo shar: End of archive 3 \(of 14\). cp /dev/null ark3isdone MISSING="" for I in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ; do if test ! -f ark${I}isdone ; then MISSING="${MISSING} ${I}" fi done if test "${MISSING}" = "" ; then echo You have unpacked all 14 archives. rm -f ark[1-9]isdone ark[1-9][0-9]isdone else echo You still need to unpack the following archives: echo " " ${MISSING} fi ## End of shell archive. exit 0