src/uniq.c - coreutils-8.23
 Global variables defined
 
 Data types defined
 
 Functions defined
 
 Macros defined
 
 Source code
  
#include <config.h>
#include <getopt.h>
#include <sys/types.h>
#include "system.h"
#include "argmatch.h"
#include "linebuffer.h"
#include "error.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "posixver.h"
#include "quote.h"
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
#include "memcasecmp.h"
#define PROGRAM_NAME "uniq"
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
#define SWAP_LINES(A, B)            \
  do                        \
    {                        \
      struct linebuffer *_tmp;            \
      _tmp = (A);                \
      (A) = (B);                \
      (B) = _tmp;                \
    }                        \
  while (0)
static bool hard_LC_COLLATE;
static size_t skip_fields;
static size_t skip_chars;
static size_t check_chars;
enum countmode
{
  count_occurrences,          count_none            };
static enum countmode countmode;
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;
static bool ignore_case;
enum delimit_method
{
    DM_NONE,
    DM_PREPEND,
    DM_SEPARATE
};
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};
static enum delimit_method delimit_groups;
enum grouping_method
{
    GM_NONE,
    GM_PREPEND,
    GM_APPEND,
    GM_SEPARATE,
    GM_BOTH
};
static char const *const grouping_method_string[] =
{
  "prepend", "append", "separate", "both", NULL
};
static enum grouping_method const grouping_method_map[] =
{
  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
};
static enum grouping_method grouping = GM_NONE;
enum
{
  GROUP_OPTION = CHAR_MAX + 1
};
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"group", optional_argument, NULL, GROUP_OPTION},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {"zero-terminated", no_argument, NULL, 'z'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
void
usage (int status)
{
  if (status != EXIT_SUCCESS)
    emit_try_help ();
  else
    {
      printf (_("\
Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
"),
              program_name);
      fputs (_("\
Filter adjacent matching lines from INPUT (or standard input),\n\
writing to OUTPUT (or standard output).\n\
\n\
With no options, matching lines are merged to the first occurrence.\n\
"), stdout);
      emit_mandatory_arg_note ();
     fputs (_("\
  -c, --count           prefix lines by the number of occurrences\n\
  -d, --repeated        only print duplicate lines, one for each group\n\
"), stdout);
     fputs (_("\
  -D, --all-repeated[=METHOD]  print all duplicate lines\n\
                          groups can be delimited with an empty line\n\
                          METHOD={none(default),prepend,separate}\n\
"), stdout);
     fputs (_("\
  -f, --skip-fields=N   avoid comparing the first N fields\n\
"), stdout);
     fputs (_("\
      --group[=METHOD]  show all items, separating groups with an empty line\n\
                          METHOD={separate(default),prepend,append,both}\n\
"), stdout);
     fputs (_("\
  -i, --ignore-case     ignore differences in case when comparing\n\
  -s, --skip-chars=N    avoid comparing the first N characters\n\
  -u, --unique          only print unique lines\n\
"), stdout);
      fputs (_("\
  -z, --zero-terminated     line delimiter is NUL, not newline\n\
"), stdout);
     fputs (_("\
  -w, --check-chars=N   compare no more than N characters in lines\n\
"), stdout);
     fputs (HELP_OPTION_DESCRIPTION, stdout);
     fputs (VERSION_OPTION_DESCRIPTION, stdout);
     fputs (_("\
\n\
A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
characters.  Fields are skipped before chars.\n\
"), stdout);
     fputs (_("\
\n\
Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
"), stdout);
      emit_ancillary_info ();
    }
  exit (status);
}
static size_t
size_opt (char const *opt, char const *msgid)
{
  unsigned long int size;
  verify (SIZE_MAX <= ULONG_MAX);
  switch (xstrtoul (opt, NULL, 10, &size, ""))
    {
    case LONGINT_OK:
    case LONGINT_OVERFLOW:
      break;
    default:
      error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
    }
  return MIN (size, SIZE_MAX);
}
static char * _GL_ATTRIBUTE_PURE
find_field (struct linebuffer const *line)
{
  size_t count;
  char const *lp = line->buffer;
  size_t size = line->length - 1;
  size_t i = 0;
  for (count = 0; count < skip_fields && i < size; count++)
    {
      while (i < size && isblank (to_uchar (lp[i])))
        i++;
      while (i < size && !isblank (to_uchar (lp[i])))
        i++;
    }
  i += MIN (skip_chars, size - i);
  return line->buffer + i;
}
static bool
different (char *old, char *new, size_t oldlen, size_t newlen)
{
  if (check_chars < oldlen)
    oldlen = check_chars;
  if (check_chars < newlen)
    newlen = check_chars;
  if (ignore_case)
    {
      FIXME      return oldlen != newlen || memcasecmp (old, new, oldlen);
    }
  else if (hard_LC_COLLATE)
    return xmemcoll (old, oldlen, new, newlen) != 0;
  else
    return oldlen != newlen || memcmp (old, new, oldlen);
}
static void
writeline (struct linebuffer const *line,
           bool match, uintmax_t linecount)
{
  if (! (linecount == 0 ? output_unique
         : !match ? output_first_repeated
         : output_later_repeated))
    return;
  if (countmode == count_occurrences)
    printf ("%7" PRIuMAX " ", linecount + 1);
  fwrite (line->buffer, sizeof (char), line->length, stdout);
}
static void
check_file (const char *infile, const char *outfile, char delimiter)
{
  struct linebuffer lb1, lb2;
  struct linebuffer *thisline, *prevline;
  if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
    error (EXIT_FAILURE, errno, "%s", infile);
  if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
    error (EXIT_FAILURE, errno, "%s", outfile);
  fadvise (stdin, FADVISE_SEQUENTIAL);
  thisline = &lb1;
  prevline = &lb2;
  initbuffer (thisline);
  initbuffer (prevline);
  
  if (output_unique && output_first_repeated && countmode == count_none)
    {
      char *prevfield IF_LINT ( = NULL);
      size_t prevlen IF_LINT ( = 0);
      bool first_group_printed = false;
      while (!feof (stdin))
        {
          char *thisfield;
          size_t thislen;
          bool new_group;
          if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
            break;
          thisfield = find_field (thisline);
          thislen = thisline->length - 1 - (thisfield - thisline->buffer);
          new_group = (prevline->length == 0
                       || different (thisfield, prevfield, thislen, prevlen));
          if (new_group && grouping != GM_NONE
              && (grouping == GM_PREPEND || grouping == GM_BOTH
                  || (first_group_printed && (grouping == GM_APPEND
                                              || grouping == GM_SEPARATE))))
            putchar (delimiter);
          if (new_group || grouping != GM_NONE)
            {
              fwrite (thisline->buffer, sizeof (char),
                      thisline->length, stdout);
              SWAP_LINES (prevline, thisline);
              prevfield = thisfield;
              prevlen = thislen;
              first_group_printed = true;
            }
        }
      if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
        putchar (delimiter);
    }
  else
    {
      char *prevfield;
      size_t prevlen;
      uintmax_t match_count = 0;
      bool first_delimiter = true;
      if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
        goto closefiles;
      prevfield = find_field (prevline);
      prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
      while (!feof (stdin))
        {
          bool match;
          char *thisfield;
          size_t thislen;
          if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
            {
              if (ferror (stdin))
                goto closefiles;
              break;
            }
          thisfield = find_field (thisline);
          thislen = thisline->length - 1 - (thisfield - thisline->buffer);
          match = !different (thisfield, prevfield, thislen, prevlen);
          match_count += match;
          if (match_count == UINTMAX_MAX)
            {
              if (count_occurrences)
                error (EXIT_FAILURE, 0, _("too many repeated lines"));
              match_count--;
            }
          if (delimit_groups != DM_NONE)
            {
              if (!match)
                {
                  if (match_count)                     first_delimiter = false;                 }
              else if (match_count == 1)
                {
                  if ((delimit_groups == DM_PREPEND)
                      || (delimit_groups == DM_SEPARATE
                          && !first_delimiter))
                    putchar (delimiter);
                }
            }
          if (!match || output_later_repeated)
            {
              writeline (prevline, match, match_count);
              SWAP_LINES (prevline, thisline);
              prevfield = thisfield;
              prevlen = thislen;
              if (!match)
                match_count = 0;
            }
        }
      writeline (prevline, false, match_count);
    }
 closefiles:
  if (ferror (stdin) || fclose (stdin) != 0)
    error (EXIT_FAILURE, 0, _("error reading %s"), infile);
  
  free (lb1.buffer);
  free (lb2.buffer);
}
enum Skip_field_option_type
  {
    SFO_NONE,
    SFO_OBSOLETE,
    SFO_NEW
  };
int
main (int argc, char **argv)
{
  int optc = 0;
  bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
  enum Skip_field_option_type skip_field_option_type = SFO_NONE;
  int nfiles = 0;
  char const *file[2];
  char delimiter = '\n';      bool output_option_used = false;   
  file[0] = file[1] = "-";
  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);
  hard_LC_COLLATE = hard_locale (LC_COLLATE);
  atexit (close_stdout);
  skip_chars = 0;
  skip_fields = 0;
  check_chars = SIZE_MAX;
  output_unique = output_first_repeated = true;
  output_later_repeated = false;
  countmode = count_none;
  delimit_groups = DM_NONE;
  while (true)
    {
      
      if (optc == -1
          || (posixly_correct && nfiles != 0)
          || ((optc = getopt_long (argc, argv,
                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
              == -1))
        {
          if (argc <= optind)
            break;
          if (nfiles == 2)
            {
              error (0, 0, _("extra operand %s"), quote (argv[optind]));
              usage (EXIT_FAILURE);
            }
          file[nfiles++] = argv[optind++];
        }
      else switch (optc)
        {
        case 1:
          {
            unsigned long int size;
            if (optarg[0] == '+'
                && posix2_version () < 200112
                && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
                && size <= SIZE_MAX)
              skip_chars = size;
            else if (nfiles == 2)
              {
                error (0, 0, _("extra operand %s"), quote (optarg));
                usage (EXIT_FAILURE);
              }
            else
              file[nfiles++] = optarg;
          }
          break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          {
            if (skip_field_option_type == SFO_NEW)
              skip_fields = 0;
            if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
              skip_fields = SIZE_MAX;
            skip_field_option_type = SFO_OBSOLETE;
          }
          break;
        case 'c':
          countmode = count_occurrences;
          output_option_used = true;
          break;
        case 'd':
          output_unique = false;
          output_option_used = true;
          break;
        case 'D':
          output_unique = false;
          output_later_repeated = true;
          if (optarg == NULL)
            delimit_groups = DM_NONE;
          else
            delimit_groups = XARGMATCH ("--all-repeated", optarg,
                                        delimit_method_string,
                                        delimit_method_map);
          output_option_used = true;
          break;
        case GROUP_OPTION:
          if (optarg == NULL)
            grouping = GM_SEPARATE;
          else
            grouping = XARGMATCH ("--group", optarg,
                                  grouping_method_string,
                                  grouping_method_map);
          break;
        case 'f':
          skip_field_option_type = SFO_NEW;
          skip_fields = size_opt (optarg,
                                  N_("invalid number of fields to skip"));
          break;
        case 'i':
          ignore_case = true;
          break;
        case 's':
          skip_chars = size_opt (optarg,
                                 N_("invalid number of bytes to skip"));
          break;
        case 'u':
          output_first_repeated = false;
          output_option_used = true;
          break;
        case 'w':
          check_chars = size_opt (optarg,
                                  N_("invalid number of bytes to compare"));
          break;
        case 'z':
          delimiter = '\0';
          break;
        case_GETOPT_HELP_CHAR;
        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
        default:
          usage (EXIT_FAILURE);
        }
    }
    if (grouping != GM_NONE && output_option_used)
    {
      error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
      usage (EXIT_FAILURE);
    }
  if (grouping != GM_NONE && countmode != count_none)
    {
      error (0, 0,
           _("grouping and printing repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }
  if (countmode == count_occurrences && output_later_repeated)
    {
      error (0, 0,
           _("printing all duplicated lines and repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }
  check_file (file[0], file[1], delimiter);
  exit (EXIT_SUCCESS);
}