/* This is from induce_rules.p */ /* -- The program code starts here --------------------------------------- */ ;;; First some utility procedures. define instance_category(item) -> category; ;;; the second element of an instance (item) is its category. ;;; could be redefined. lvars item, category; item(2) -> category enddefine; define instance_values(item) -> vals; ;;; Given an instance extract its list of test values lvars item, vals; hd(item) -> vals; enddefine; define instance_value(item, index) -> val; ;;; given an instance and a test index, return the value of ;;; that test for that instance. lvars item, index, val; instance_values(item)(index) -> val; enddefine; define index_of(test) -> index; ;;; the second element of a test specification is its index lvars test, index; test(2) -> index enddefine; define test_name(test) -> name; lvars test, name; hd(test) -> name enddefine; define test_values(test) -> vals; ;;; the test values come after the first two elements lvars test, vals; tl(tl(test)) -> vals enddefine; define all_same_category(instances) -> category; ;;; Does every instance get assigned to the same category? ;;; If so return the category, otherwise false lvars instances, category, item; destpair(instances) -> (item, instances); instance_category(item) -> category; for item in instances do unless instance_category(item) = category do false -> category; return; endunless; endfor; enddefine; /* ;;; tests all_same_category( [ [[1 2] A] [[2 3] A] [[4 5] A]]) => ** A all_same_category( [ [[1 2] A] [[2 3] A] [[4 5] B]]) => ** */ define getall(test, test_val, instances) ->(this_branch, rest); ;;; For a given test, e.g. test3 and its possible value e.g. t3a, ;;; find all the instances that have that value for that test, ;;; and return a list of them as this_branch, leaving the remaining ;;; instances in the list rest, which is also returned lvars test, test_val, instances, this_branch = [], rest = [] ; lvars item, index = index_of(test); for item in instances do ;;; Get the test value in position index in this instance, ;;; and compare it with test_value, then decide which list ;;; to put the the instance into if instance_values(item)(index) == test_val then [^item ^^this_branch] -> this_branch else [^item ^^rest] -> rest endif endfor; ;;; Above, We built the lists up from the left end for efficiency, ;;; so the items will be in reverse order. ;;; Now restore the original order by reversing the lists. ;;; It is safe to use fast_ncrev (to save garbage collections) when ;;; we know we have built up properly constructed lists. fast_ncrev(this_branch) -> this_branch; fast_ncrev(rest) -> rest enddefine; /* ;;; test vars test1 = [A 1 a1 a2 a3], test2 = [B 2 b1 b2 b3], data = [ [[a1 b1] X] [[a2 b1] Y] [[a3 b1] Y] [[a1 b2] X] [[a2 b2] Y] [[a3 b2] X] [[a1 b3] X] [[a2 b3] Y] [[a3 b3] Y] ]; ;;; NB rest will print first in these two tests. Get all with "a2" getall(test1, "a2", data) ==> ==> ** [[[a1 b1] X] [[a3 b1] Y] [[a1 b2] X] [[a3 b2] X] [[a1 b3] X] [[a3 b3] Y]] ** [[[a2 b1] Y] [[a2 b2] Y] [[a2 b3] Y]] ;;; get all whose outcome for test2 has value "b3" getall(test2, "b3", data) ==> ==> ** [[[a1 b1] X] [[a2 b1] Y] [[a3 b1] Y] [[a1 b2] X] [[a2 b2] Y] [[a3 b2] X]] ** [[[a1 b3] X] [[a2 b3] Y] [[a3 b3] Y]] */ /* -- The main procedure: induce_rules ----------------------------------- */ define induce_rules(tests, instances) -> tree; ;;; for specification see top of file lvars tests, instances, tree; lvars test, category, test_val, this_branch; if all_same_category(instances) ->> category then category -> tree; elseif tests == [] then ;;; A set of instances have come out with the same results for ;;; all tests, yet they don't have the same category label. so ;;; they are inconsistent. ;;; We could cause an error message, but it is more interesting ;;; not to. ;;; Return a special sub tree with the inconsistent instances ;;; collected in a vector (useful for showtree) [INCONSISTENT {^^instances} ] -> tree; else ;;; Start building the tree. Recursion will do the main work [% ;;; Get the first test, and divide the instances according ;;; to the values they have for that test, and handle ;;; other tests by recursion hd(tests) -> test; ;;; Put test label at top of tree test_name(test), ;;; build subtrees for each possible test value for test_val in test_values(test) do quitif(instances == []); getall(test, test_val, instances) ->(this_branch,instances); unless this_branch == [] then ;;; Build a partial tree for items having this ;;; test_val for this test [%test_val, induce_rules(tl(tests), this_branch)%] endunless; endfor %] -> tree; ;;; If the tree has only one branch the test used is spurious. ;;; so pull up the lower level sub tree and return that, except ;;; where the tree has the label "INCONSISTENT" ;;; Try commenting out the next three lines and see what difference ;;; it makes to the tests above! while islist(tree) and hd(tree) /== "INCONSISTENT" and listlength(tl(tree)) == 1 do hd(tl(tree)) -> tree endwhile; ;;; check that all the instances got used! if instances /== [] then mishap('Unrecognized instance value?',[^test ^instances]) endif endif enddefine; /* -- A program to turn the decision tree into an expert system Once you have a decision tree of the sort produced by induce_rules you can use it with the procedure classify_instance defined below to interrogate a user about a new instance, by asking a series of questions, using the answers to traverse the tree, and ending with a classification of the new instance. The program is fairly rigid as it does not make use of any "don't know" answers, or any test values other than those found in data used to create the original tree. */ define classify_instance(tree) -> category; lvars tree, category; if atom(tree) then ;;; found the answer, as a result of recursing down the tree tree -> category else ;;; the tree will have a test type and a list of subtrees each ;;; with a test-value at its root. lvars test_type, subtrees, vals, subtree, answer, count = 1; destpair(tree) -> (test_type, subtrees); ;;; Make a list of possible test values, with numeric codes [%' ', for subtree in subtrees do ;;; count sys_>< ":", hd(subtree), ' ', count + 1 -> count; endfor, ;;; Replace last string with newline erase(), newline%] -> vals; repeat ['For test of type <' ^test_type '> what value does the instance have?'] => 'Please type the numeric code for the value in this list'=> applist(vals, pr); 'If you don\'t know or it is not one of these, just press RETURN' => readline() -> answer; if answer == [] then false -> category; return() endif; if listlength(answer) == 1 then hd(answer) -> answer endif; quitif(isnumber(answer) and answer > 0 and answer <= listlength(subtrees)); 'Sorry, that\'s not one of the acceptable numbers. Please try again' => endrepeat; ;;; Use the number code to select the subtree and recurse ;;; on the head of its tail (because of the structure of the ;;; output of induce_rules hd(tl(subtrees(answer))) -> subtree; if islist(subtree) and hd(subtree) == "INCONSISTENT" then 'Data for this case inconsistent' else classify_instance(subtree) endif -> category endif; enddefine; /* -- Further exercises -------------------------------------------------- 1. Create several large data-sets and see how well induce rules performs with them. Can you construct a set of data and a set of tests such that induce_rules produces a rather silly decision tree, e.g. one for each each question only sets a single instance apart from the rest, so that for identification of some instances you have to ask a great many questions whereas for others you have to ask very few. 2. Try reading the literature on ID3 and see if you can change the program so as to produce a more balanced decision tree. 3. Try to decide whether the above algorithm provides a sound basis for predicting the outcome of new cases on the basis of old examples. Is there any way you could improve its reliability? 4. How does a program like induce_rules compare with training a neural net on examples? 5. Under what conditions can a tree produced by induce_rules classify an instance with features that it has never seen before. 6. Produce a version of classify_instance, which, instead of interrogating the user is given a decision tree and an instance and produces a category for the instance (or false if it cannot). 7. As shown by the animals example near the beginning of this file, the output of induce_rules for the same set of instances can be different depending on the order in which the test specifications are given. Sometimes the two decision trees produced for a given set of data and a given set of test specifications will include different tests, or will use different subsets of possible values for a given test (like the animals example above). This can imply that the data show that there are different sufficient conditions for something to be classified, and the above procedure induce_rules will not find them all. Can you fix this? 8. Can you extend classify_instance so that if the current decision tree does not enable you to classify an instance, you can have a little dialogue in which you say how it should be classified, as a result of which classify_instance modifies the decision tree? The easiest way would be to make it add the new instance to the original list of instances and then recreate the decision tree from the enlarged list. Is there a better way to extend the decision tree incrementally? 9. Consider how induce_rules deals with inconsistent data. I.e. it produces a node in the decision tree recording the inconsistency. Are there better ways it could handle such data? (Presumably it will depend on the problem domain.) 10. The procedure classify_instances has a very poor user interface. What is wrong with it? Can you improve it? Would it be better to use pop-up dialogue boxes ? (See HELP * POPUPTOOL) */