Compute first set of a grammar
authorThadeu Lima de Souza Cascardo <cascardo@dcc.ufmg.br>
Wed, 28 Sep 2005 05:05:20 +0000 (05:05 +0000)
committerThadeu Lima de Souza Cascardo <cascardo@dcc.ufmg.br>
Wed, 28 Sep 2005 05:05:20 +0000 (05:05 +0000)
Compute the first set for every nonterminal of the grammar.

git-archimport-id: cascardo@tlscascardo--private/libgrammatic--dev--0.1--patch-8

first.c [new file with mode: 0644]
first.h [new file with mode: 0644]

diff --git a/first.c b/first.c
new file mode 100644 (file)
index 0000000..febcc2e
--- /dev/null
+++ b/first.c
@@ -0,0 +1,497 @@
+#include <grammar.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef struct
+{
+  gboolean has_empty;
+  GHashTable* terminals;
+  GList* rules;
+  GList* recursive;
+} first_set_t;
+
+first_set_t* first_set_new ()
+{
+  first_set_t* first_set;
+  first_set = g_malloc (sizeof (first_set_t));
+  first_set->has_empty = FALSE;
+  first_set->terminals = g_hash_table_new_full (symbol_hash, symbol_equal,
+                                               g_free, NULL);
+  first_set->rules = NULL;
+  first_set->recursive = NULL;
+  return first_set;
+}
+
+void first_set_delete (first_set_t* first_set)
+{
+  GList* l;
+  g_hash_table_destroy (first_set->terminals);
+  l = first_set->rules;
+  while (l != NULL)
+    {
+      rule_delete (l->data);
+      l = g_list_next (l);
+    }
+  g_list_free (first_set->rules);
+  l = first_set->recursive;
+  while (l != NULL)
+    {
+      rule_delete (l->data);
+      l = g_list_next (l);
+    }
+  g_list_free (first_set->recursive);
+  g_free (first_set);
+}
+
+void first_set_insert_terminal (first_set_t* first_set, symbol_t* symbol)
+{
+  g_hash_table_insert (first_set->terminals, symbol, NULL);
+}
+
+void first_set_insert (gpointer key, gpointer val, gpointer data)
+{
+  g_hash_table_insert (data, symbol_copy (key), NULL);
+}
+
+GHashTable* first_set_union (GHashTable* a, GHashTable* b)
+{
+  g_hash_table_foreach (b, first_set_insert, a);
+  return a;
+}
+
+void first_advance_recursive (first_set_t* first_set, symbol_t* symbol)
+{
+
+  GList* rules;
+  rules = first_set->recursive;
+
+  /*
+   * For each recursive rule, remove recursiveness and insert any new
+   * rule to list of non-recursive rules.
+   */
+
+  while (rules != NULL)
+    {
+
+      rule_t* rule;
+      GList* symbols;
+      symbol_t* first_symbol;
+
+      rule = rules->data;
+      symbols = grammar_get_rule (rule);
+
+      assert (symbols != NULL);
+
+      first_symbol = symbols->data;
+
+      assert (first_symbol->value == symbol->value);
+
+      /* Remove any leading symbols equal to the recursive nonterminal */
+      while (first_symbol->value == symbol->value)
+       {
+         first_symbol = rule_pop (rule);
+
+         /* If no symbol remains after removing leading recursive
+          * nonterminal, it may produce the empty string */
+         if (first_symbol == NULL)
+           {
+             first_set->has_empty = TRUE;
+             rule_delete (rule);
+             break;
+           }
+         /* If after removing leading recursive nonterminal, there
+          * is a terminal, it is in the first set of the nonterminal */
+         else if (first_symbol->terminal)
+           {
+             first_set_insert_terminal (first_set,
+                                        symbol_copy (first_symbol));
+             rule_delete (rule);
+             break;
+           }
+         /*
+          * If after removing leading recursive nonterminal, there
+          * is a nonterminal, the first of the remaining sequence
+          * of symbols is in the first set of the recursive
+          * nonterminal
+          */
+         else if (first_symbol->value != symbol->value)
+           {
+             first_set->rules = g_list_append (first_set->rules, rule);
+             break;
+           }
+       }
+
+      rules = g_list_next (rules);
+
+    }
+
+  g_list_free (first_set->recursive);
+  first_set->recursive = NULL;
+
+}
+
+void first_prepare (gpointer key, gpointer val, gpointer data)
+{
+
+  symbol_t* symbol;
+  GList* rules;
+  GHashTable* first;
+
+  first_set_t* first_set;
+
+  symbol = (symbol_t*) key;
+  rules = *(GList**) val;
+  first = (GHashTable*) data;
+
+  assert (symbol->terminal == FALSE);
+
+  /* For each nonterminal in the grammar, there is a first set */
+  first_set = first_set_new ();
+  g_hash_table_insert (first, symbol_copy (symbol), first_set);
+
+  /*
+   * For each rule for the nonterminal in the grammar, it will either
+   * produce the empty string directly, start with a terminal that will
+   * be included in the first set, start with a different nonterminal or
+   * be a recursive rule.
+   */
+  while (rules != NULL)
+    {
+
+      rule_t* rule;
+      GList* symbols;
+      symbol_t* first_symbol;
+
+      rule = rules->data;
+      symbols = grammar_get_rule (rule);
+
+      if (symbols == NULL)
+       {
+         first_set->has_empty = TRUE;
+       }
+      else
+       {
+         first_symbol = symbols->data;
+         if (first_symbol->terminal)
+           {
+             first_set_insert_terminal (first_set,
+                                        symbol_copy (first_symbol));
+           }
+         else if (first_symbol->value == symbol->value)
+           {
+             first_set->recursive = g_list_prepend (first_set->recursive,
+                                                    rule_copy (rule));
+           }
+         else
+           {
+             first_set->rules = g_list_prepend (first_set->rules,
+                                                rule_copy (rule));
+           }
+       }
+
+      rules = g_list_next (rules);
+
+    }
+
+  /*
+   * If there any no non-recursive rules starting with a nonterminal
+   * recursiveness may be easily eliminated.
+   */
+  if (first_set->rules == NULL && first_set->recursive != NULL)
+    {
+      /*
+       * If the nonterminal may produce the empty string, each recursive
+       * rule will lead to a new rule with the leading recursive nonterminal
+       * removed.
+       */
+      if (first_set->has_empty)
+       {
+         first_advance_recursive (first_set, symbol);
+       }
+      /* If it does not produce the empty string, every recursive rule
+       * may be simply removed. All terminals in first set are known at
+       * this point.
+       */
+      else
+       {
+         GList* l;
+         l = first_set->recursive;
+         while (l != NULL)
+           {
+             rule_delete (l->data);
+             l = g_list_next (l);
+           }
+         g_list_free (first_set->recursive);
+         first_set->recursive = NULL;
+       }
+      assert (first_set->recursive == NULL);
+    }
+      
+}
+
+void first_iterate (gpointer key, gpointer val, gpointer data)
+{
+
+  symbol_t* symbol;
+  first_set_t* first_set;
+  GHashTable* first;
+
+  GList* rules;
+  GList* new_rules;
+
+  symbol = (symbol_t*) key;
+  first_set = (first_set_t*) val;
+  first = (GHashTable*) data;
+
+  assert (symbol->terminal == FALSE);
+
+  /* If there are no non-recursive rules starting with a
+   * nonterminal, there's not much to do. Only remove all
+   * the recursive rules or advance them as described in
+   * the first_prepare.
+   */
+  if (first_set->rules == NULL)
+    {
+      if (first_set->recursive != NULL)
+       {
+         if (first_set->has_empty)
+           {
+             first_advance_recursive (first_set, symbol);
+           }
+         else
+           {
+             GList* l;
+             l = first_set->recursive;
+             while (l != NULL)
+               {
+                 rule_delete (l->data);
+                 l = g_list_next (l);
+               }
+             g_list_free (first_set->recursive);
+             first_set->recursive = NULL;
+           }
+         assert (first_set->recursive == NULL);
+       }
+      return;
+    }
+  else
+    {
+
+      rules = first_set->rules;
+      new_rules = NULL;
+
+      /*
+       * For each rule, try to eliminate it by checking the first set of
+       * the nonterminal with starts it. Rules may have been added by
+       * previous iterations or the advancement of recursive rules.
+       * So, trivial checks like empty rules and rules starting with
+       * terminals should be checked too.
+       */
+      while (rules != NULL)
+       {
+
+         rule_t* rule;
+         GList* symbols;
+         symbol_t* first_symbol;
+
+         rule = rules->data;
+         symbols = grammar_get_rule (rule);
+
+         /* If it is an empty rule, nonterminal may produce the empty
+          * string */
+         if (symbols == NULL)
+           {
+             first_set->has_empty = TRUE;
+             rule_delete (rule);
+           }
+         else
+           {
+
+             first_set_t* next_set;
+
+             first_symbol = symbols->data;
+
+             /* If it starts with a terminal, include it in the first
+              * set and remove the rule. */
+             if (first_symbol->terminal)
+               {
+                 first_set_insert_terminal (first_set,
+                                            symbol_copy (first_symbol));
+                 rule_delete (rule);
+               }
+             /* Get the first set of the nonterminal which starts the rule */
+             else if (g_hash_table_lookup_extended (first, first_symbol, NULL,
+                                                    (gpointer*) &next_set))
+               {
+                 /*
+                  * If nonterminal produces the empty string, the first set
+                  * of the sequence of symbols next to it is in the first
+                  * set of the nonterminal we are treating. Add a new rule
+                  * corresponding to this sequence of symbols. This is the
+                  * rule which says: if there is a production A -> y1 y2 yk
+                  * and the empty string is in the first of y1, then first
+                  * of y2 yk is in first of A.
+                  */
+                 if (next_set->has_empty)
+                   {
+                     rule_t* new_rule;
+                     new_rule = rule_copy (rule);
+                     rule_pop (new_rule);
+                     new_rules = g_list_prepend (new_rules, new_rule);
+                   }
+                 /* The terminals in the first of the leading nonterminal in
+                  * the rule are in the first of our current nonterminal */
+                 first_set_union (first_set->terminals,
+                                  next_set->terminals);
+                 /*
+                  * If there are other rules starting with a nonterminal
+                  * for this nonterminal, they should be copied to the
+                  * current nonterminal, so we can detect and treat indirect
+                  * recursiveness properly.
+                  */
+                 if (next_set->rules != NULL)
+                   {
+                     /* FIXME: not much of an advance here, watch for
+                      * this in a possible unstopabble loop */
+                     /* FIXME: copy recursive rules too, with care */
+                     GList* next_rules;
+                     next_rules = next_set->rules;
+                     while (next_rules != NULL)
+                       {
+                         rule_t* next_rule = rule_copy (next_rules->data);
+                         GList* next_symbols;
+                         next_symbols = grammar_get_rule (next_rule);
+                         /*
+                          * If the rule is empty, both terminals produce the
+                          * empty set. This will be detected for the other
+                          * nonterminal later. Optimization could be done
+                          * here. TODO
+                          */
+                         if (next_symbols == NULL)
+                           {
+                             first_set->has_empty = TRUE;
+                             rule_delete (next_rule);
+                           }
+                         else
+                           {
+                             symbol_t* next_symbol;
+                             next_symbol = next_symbols->data;
+                             /* TODO: Check this assertion is correct. */
+                             assert (next_symbol->terminal == FALSE);
+                             /* This is an indirect recursive rule. */
+                             if (next_symbol->value == symbol->value)
+                               {
+                                 first_set->recursive =
+                                   g_list_prepend (first_set->recursive,
+                                                   next_rule);
+                               }
+                             /* Next time we treat current nonterminal,
+                              * we will check this rule directly. */
+                             else
+                               {
+                                 new_rules = g_list_prepend (new_rules, next_rule);
+                               }
+                           }
+                         next_rules = g_list_next (next_rules);
+                       }
+                   }
+                 /*
+                  * If nonterminal has recursive rules, we should expect
+                  * more rules to add here and so we will check it again
+                  * in the next iteration. Keep the rule. If there is only
+                  * indirect recursiveness, we will detect it through the
+                  * rules we have added before. We should remove it in this
+                  * case so we make advance.
+                  */
+                 if (next_set->recursive != NULL)
+                   {
+                     new_rules = g_list_prepend (new_rules, rule);
+                   }
+                 else
+                   {
+                     rule_delete (rule);
+                   }
+               }
+             else
+               {
+                 assert (TRUE);
+               }
+
+           }
+
+         rules = g_list_next (rules);
+
+       }
+
+      g_list_free (first_set->rules);
+      first_set->rules = new_rules;
+
+    }
+
+}
+
+void first_check (gpointer key, gpointer val, gpointer data)
+{
+
+  symbol_t* symbol;
+  first_set_t* first_set;
+  gboolean* stop;
+
+  symbol = (symbol_t*) key;
+  first_set = (first_set_t*) val;
+  stop = (gboolean*) data;
+
+  assert (symbol->terminal == FALSE);
+
+  /* If there is anything besides terminals in the first set,
+   * we must iterate more and remove all of them. */
+  if (first_set->rules != NULL || first_set->recursive != NULL)
+    {
+      *stop = FALSE;
+    }
+
+}
+
+/*
+ * Calculate the first set for every nonterminal symbol in the grammar.
+ * We should iterate through the rules for each nonterminal until only
+ * terminals are known to be in the first set of it.
+ */
+GHashTable* grammar_first (Grammar* grammar)
+{
+  GHashTable* first;
+  gboolean stop;
+  first = g_hash_table_new_full (symbol_hash, symbol_equal,
+                                g_free,
+                                first_set_delete);
+  g_hash_table_foreach (grammar->grammar, first_prepare, first);
+  do
+    {
+      g_hash_table_foreach (first, first_iterate, first);
+      stop = TRUE;
+      g_hash_table_foreach (first, first_check, &stop);
+    } while (stop == FALSE);
+  return first;
+}
+
+void put_key_on_list (gpointer key, gpointer val, gpointer data)
+{
+  GList** l;
+  l = (GList**) l;
+  *l = g_list_prepend (*l, key);
+}
+
+GList* first_get (GHashTable* first, symbol_t* symbol)
+{
+
+  first_set_t* first_set;
+  GList* l;
+  l = NULL;
+  if (g_hash_table_lookup_extended (first, symbol, NULL,
+                                   (gpointer*) &first_set))
+    {
+      g_hash_table_foreach (first->terminals, put_key_on_list, &l);
+    }
+  return l;
+
+}
diff --git a/first.h b/first.h
new file mode 100644 (file)
index 0000000..1f0794c
--- /dev/null
+++ b/first.h
@@ -0,0 +1,9 @@
+#ifndef FIRST_H
+#define FIRST_H
+
+#include <grammar.h>
+
+GHashTable* grammar_first (Grammar*);
+GList* first_get (GHashTable*, symbol_t*);
+
+#endif