from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import random
import math
from nltk.corpus import stopwords

# Load tokenizer and model for the masked language model
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Masking Model
def mask_non_stopword(sentence):
    """Mask one randomly chosen non-stopword and return the masked sentence
    along with the fill-mask prediction scores and predicted tokens."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, tokens

def mask_non_stopword_pseudorandom(sentence):
    """Same as mask_non_stopword, but seeds the RNG so the chosen word
    is reproducible across runs."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    random.seed(10)  # fixed seed makes the "random" choice deterministic
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, tokens

def high_entropy_words(sentence, non_melting_points):
    """Mask the candidate word whose [MASK] predictions have the highest
    entropy. Words appearing in non_melting_points are never masked."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_melting_words = set()
    for _, point in non_melting_points:
        non_melting_words.update(point.lower().split())
    candidate_words = [word for word in words
                       if word.lower() not in stop_words
                       and word.lower() not in non_melting_words]
    if not candidate_words:
        return sentence, None, None
    max_entropy = -float('inf')
    max_entropy_word = None
    max_predictions = None
    for word in candidate_words:
        masked_sentence = sentence.replace(word, '[MASK]', 1)
        predictions = fill_mask(masked_sentence)
        # Shannon entropy over the top-5 prediction scores
        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
        if entropy > max_entropy:
            max_entropy = entropy
            max_entropy_word = word
            max_predictions = predictions  # keep the winning word's predictions
    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
    scores = [pred['score'] for pred in max_predictions]
    tokens = [pred['token_str'] for pred in max_predictions]
    return masked_sentence, scores, tokens

non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
a, b, c = high_entropy_words(
    "A former Cornell University student was sentenced to 21 months in prison "
    "on Monday after admitting that he had posted a series of online messages "
    "last fall in which he threatened to stab, rape and behead Jewish people",
    non_melting_points,
)
print(f"scores type: {type(b)}")
print(f"scores content: {b}")
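
# Illustrative check of the random masking helpers (the sample sentence below
# is hypothetical and not part of the original pipeline). mask_non_stopword
# picks a fresh word each run, while mask_non_stopword_pseudorandom always
# masks the same word thanks to its fixed seed.
demo_sentence = "The quick brown fox jumps over the lazy dog"
masked, demo_scores, demo_tokens = mask_non_stopword_pseudorandom(demo_sentence)
print(f"masked: {masked}")
print(f"top predictions: {list(zip(demo_tokens, demo_scores))}")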