Distinct Characters
May 9, 2017
The quadratic solution uses two nested loops to pair each character to every other character:
(define (distinct? str)
(call-with-current-continuation
(lambda (return)
(let ((cs (string->list str)))
(do ((cs cs (cdr cs))) ((null? cs) #t)
(do ((ds (cdr cs) (cdr ds))) ((null? ds))
(when (char=? (car cs) (car ds))
(return #f))))))))
> (distinct? "Programming")
#f
> (distinct? "Praxis")
#t
We use a common Scheme idiom call-with-current-continuation to define return that performs an early return from the function as soon as it finds a non-distinct character.
The O(n log n) solution sorts the characters in the string, then runs through them to determine if there are any adjacent duplicates:
(define (distinct? str)
(let loop ((cs (sort char<? (string->list str))))
(if (null? (cdr cs)) #t
(if (char=? (car cs) (cadr cs)) #f
(loop (cdr cs))))))
> (distinct? "Programming")
#f
> (distinct? "Praxis")
#t
The linear solution uses a hash table to store characters, checking before each insertion if the character is already present:
(define (distinct? str)
(let ((letters (make-eq-hashtable)))
(let loop ((cs (string->list str)))
(if (null? cs) #t
(if (hashtable-ref letters (car cs) #f) #f
(begin (hashtable-set! letters (car cs) 1)
(loop (cdr cs))))))))
> (distinct? "Programming")
#f
> (distinct? "Praxis")
#t
We used R6RS hash tables in the code shown above, but at http://ideone.com/0Yw1Bd we used Guile hash tables, which differ in their calling syntax.
In Python.
def order_n(word): return len(set(word)) == len(word) def order_nln(word): chars = sorted(word) return not any(a == b for a, b in zip(chars, chars[1:])) def order_n2(word): return not any(a == b for i, a in enumerate(word) for b in word[i+1:])Added an additional method to Paul’s solution in Python. Uses pruning, combined with dictionary add/lookup is O(1) average. Has better performance, depending on distribution of frequently occuring characters amongst all different words (i.e. character ‘e’ reoccurs frequently in English).
import time def order_n(word): return len(set(word)) == len(word) def order_nln(word): chars = sorted(word) return not any(a == b for a, b in zip(chars, chars[1:])) def order_n2(word): return not any(a == b for i, a in enumerate(word) for b in word[i+1:]) def order_n_prune(word): d = {} for c in word: if c in d: return False else: d[c] = 1 return True def test_performance(list_of_words, func): start_time = time.time() distinct = 0 for word in list_of_words: if func(word): distinct += 1 print(func.__name__, distinct, len(list_of_words)) print("Took", time.time() - start_time, "seconds.") list_of_words = [] words_file = '/usr/share/dict/words' for line in open(words_file): word = line.strip() list_of_words.append(word) test_performance(list_of_words, order_n) test_performance(list_of_words, order_nln) test_performance(list_of_words, order_n2) test_performance(list_of_words, order_n_prune) # RESULTS # python unique.py # ('order_n', 99419, 479828) # Took 0.4857931137084961 seconds. # ('order_nln', 99419, 479828) # Took 1.2898380756378174 seconds. # ('order_n2', 99419, 479828) # Took 1.4195671081542969 seconds. # ('order_n_prune', 99419, 479828) # Took 0.416719913482666 seconds.Rutger: I expect the difference between the set object and your emulation of it is just noise.
John: order_n_prune is rougly 10-20% faster than order_n for the words in /usr/share/dict/words (linux). For other datasets this may not hold, i.e. a dataset with words that only have unique characters.
Klong: Not sure how to provide the 3 different solutions, so here’s the quickest one
str::”Programming Praxis”
“Programming Praxis”
str=?str
0
str::”Praxis”
“Praxis”
str=?str
1
:” ?a returns a list containing unique elements from ‘a’ in order of appearance.”
[sourcecode lang="css"]
Not sure how to do the 3 solutions, so here’s just one:
DISTCHAR (str) ; Check for distinct characters
n arr,flg,i
s flg=1
f i=1:1:$l(str) d q:’flg
. s char=$e(str,i)
. i $d(arr(char)) s flg=0
. e s arr(char)=””
q flg
MCL> W $$DISTCHAR^DISTCHAR(“Programming Praxis”)
0
MCL> W $$DISTCHAR^DISTCHAR(“Praxis”)
1
Just the fastest solution (O[n] time) in C11:
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
bool
distinct (char *str);
int
main (int argc, char **argv)
{
char *strings[] = {"Programming", "Praxis", "Cat", "Dog", "Dogg", NULL};
for (register size_t i = 0; strings[i] != NULL; ++i)
{
if (distinct(strings[i]))
printf("'%s' is distinct\n", strings[i]);
else
printf("'%s' is not distinct\n", strings[i]);
}
exit(0);
}
// O(n) solution
bool
distinct (char *str)
{
size_t arr[127] = {0}; // ASCII
const size_t n = strlen(str);
for (register size_t i = 0; i < n; ++i)
if (++arr[str[i]] > 1)
return false;
return true;
}