Natural Language Processing

Natural Language Processing

Natural Language Processing (NLP) is the interdisciplinary field that develops computational methods to process, analyze, understand, and generate human language:

  • Using computers to study human language, e.g. measuring language change over time, detecting dialectal variation, evaluating grammatical theories, etc.
  • Enabling computers to deal with language, e.g. computer translation, summarization, information extraction, comment moderation, etc.
import math
from collections import Counter
from itertools import combinations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.patches as mpatches
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import seaborn as sns

from pathlib import Path
import requests

import fitz
import arxiv, time, requests, os
import pyreadr

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

import spacy
from spacy.cli import download
# Fetch the small English pipeline at import time so that spacy.load()
# further down cannot fail on a fresh environment (no-op when present).
download("en_core_web_sm")

from datasets import load_dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchviz import make_dot
from torchview import draw_graph

from gensim.models import Word2Vec
import gensim.downloader as api
# Pretrained 300-dimensional word2vec vectors trained on Google News
# (large download on first use; cached afterwards by gensim).
w2v_pretrained_model = api.load('word2vec-google-news-300')

import editdistance

from PIL import Image
from io import BytesIO
import easyocr

import unicodedata
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sentence_transformers import SentenceTransformer

from bs4 import BeautifulSoup

from tokenizers import Tokenizer
from tokenizers import decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
Collecting en-core-web-sm==3.8.0

  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/12.8 MB ? eta -:--:--

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 93.3 MB/s  0:00:00


✔ Download and installation successful

You can now load the package via spacy.load('en_core_web_sm')

⚠ Restart to reload dependencies

If you are in a Jupyter or Colab notebook, you may need to restart Python in

order to load all the package's dependencies. You can do this by selecting the

'Restart kernel' or 'Restart runtime' option.

Outline

  1. Introduction to handling text data
  2. Intuition about “AI”
  3. Practical applications for social sciences

We are going to use mostly two corpora of text:

  • Economics pre-print papers from arXiv
  • US presidents inaugural speeches
# Download up to `target_n` recent PDFs from the arXiv General Economics
# category into `out_dir`, then read them all back as raw bytes.
query = "cat:econ.GN"  # arXiv category query: General Economics
target_n = 50          # stop after this many successful downloads
out_dir = "arxiv_papers"
os.makedirs(out_dir, exist_ok=True)

search = arxiv.Search(query=query, max_results=None, sort_by=arxiv.SortCriterion.SubmittedDate)

# Use Client.results(): Search.results() is deprecated (the original run
# emitted a DeprecationWarning telling us to switch).
client = arxiv.Client()

downloaded = 0
for result in client.results(search):
    if downloaded >= target_n:
        break

    arxiv_id = result.get_short_id()
    pdf_url = result.pdf_url
    filename = os.path.join(out_dir, f"{arxiv_id}.pdf")

    if not os.path.exists(filename):
        # Stream the PDF to disk in 1 MiB chunks; a timeout prevents a
        # stalled connection from hanging the whole loop.
        r = requests.get(pdf_url, stream=True, timeout=30)
        if r.status_code == 200:
            with open(filename, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024*1024):
                    if chunk:
                        f.write(chunk)
            downloaded += 1
    time.sleep(3)  # be polite to the arXiv servers between requests

# Load every downloaded PDF back into memory as bytes, keyed by filename.
papers = {}
for fname in os.listdir(out_dir):
    if fname.endswith(".pdf"):
        fpath = os.path.join(out_dir, fname)
        with open(fpath, "rb") as f:
            papers[fname] = f.read()
/tmp/ipykernel_10194/4149224177.py:9: DeprecationWarning: The 'Search.results' method is deprecated, use 'Client.results' instead
  for result in search.results():
fname, papers[fname][0:500] # pdf is not raw text!  It's a layout format: it stores instructions like "draw glyph 'A' at coordinates (72, 540)".
('2603.21874v1.pdf',
 b'%PDF-1.7\n%\xbf\xf7\xa2\xfe\n1 0 obj\n<< /Metadata 3 0 R /Names 4 0 R /OpenAction 5 0 R /Outlines 6 0 R /PageMode /UseOutlines /Pages 7 0 R /Type /Catalog >>\nendobj\n2 0 obj\n<< /Author (Ian Crawford; Carl-Emil Pless) /Creator (arXiv GenPDF \\(tex2pdf:a6404ea\\)) /DOI (https://doi.org/10.48550/arXiv.2603.21874) /License (http://arxiv.org/licenses/nonexclusive-distrib/1.0/) /PTEX.Fullbanner (This is pdfTeX, Version 3.141592653-2.6-1.40.28 \\(TeX Live 2025\\) kpathsea version 6.4.1) /Producer (pikepdf 8.15.1) /Title (')
# Load the NLTK corpus of US presidential inaugural addresses into a
# DataFrame with one row per speech: year, speaker, and full text.
nltk.download('inaugural')

from nltk.corpus import inaugural

rows = []
for fid in inaugural.fileids():
    # File ids look like "1789-Washington.txt": split once on the first dash.
    year_str, speaker = fid.replace('.txt', '').split('-', 1)
    rows.append({
        "year": int(year_str),
        "speaker": speaker,
        "text": inaugural.raw(fid),
    })

us = pd.DataFrame(rows)
[nltk_data] Downloading package inaugural to /home/onyxia/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
us[0:20]
year speaker text
0 1789 Washington Fellow-Citizens of the Senate and of the House...
1 1793 Washington Fellow citizens, I am again called upon by the...
2 1797 Adams When it was first perceived, in early times, t...
3 1801 Jefferson Friends and Fellow Citizens:\n\nCalled upon to...
4 1805 Jefferson Proceeding, fellow citizens, to that qualifica...
5 1809 Madison Unwilling to depart from examples of the most ...
6 1813 Madison About to add the solemnity of an oath to the o...
7 1817 Monroe I should be destitute of feeling if I was not ...
8 1821 Monroe Fellow citizens, I shall not attempt to descri...
9 1825 Adams In compliance with an usage coeval with the ex...
10 1829 Jackson Fellow citizens, about to undertake the arduou...
11 1833 Jackson Fellow citizens, the will of the American peop...
12 1837 VanBuren Fellow citizens: The practice of all my predec...
13 1841 Harrison Called from a retirement which I had supposed ...
14 1845 Polk Fellow citizens, without solicitation on my pa...
15 1849 Taylor Elected by the American people to the highest ...
16 1853 Pierce My Countrymen, It a relief to feel that no hea...
17 1857 Buchanan Fellow citizens, I appear before you this day ...
18 1861 Lincoln Fellow-Citizens of the United States: In compl...
19 1865 Lincoln Fellow-Countrymen:\n\nAt this second appearing...

Making raw data useable

Text data come in a variety of formats, e.g. PDF, images (scans), JSON, XML…

There is usually a lot of work needed to make raw data useable.

Some examples:

  • PDF is a layout format, describing where different elements should go. It contains both text, images, and metadata.
# Extract plain text from every PDF with PyMuPDF (fitz): open each byte
# stream and join the per-page text with newlines.
papers_text = {fname: "\n".join(page.get_text() for page in fitz.open(stream=content, filetype="pdf")) for fname, content in papers.items()}

# Contrast the raw PDF bytes with the extracted text for the same file.
print((papers[fname][:100]), '\n\n', papers_text[fname][:100])
b'%PDF-1.7\n%\xbf\xf7\xa2\xfe\n1 0 obj\n<< /Metadata 3 0 R /Names 4 0 R /OpenAction 5 0 R /Outlines 6 0 R /PageMode /' 

 Does Anxiety Improve Economic
Decision-Making?
Ian Crawford∗
Carl-Emil Pless†
March 24, 2026
We stud
  • Web scraping (see previous classes)
# Fetch The Guardian's front page and print the first ten headlines.
url = "https://www.theguardian.com"
headers = {"User-Agent": "Mozilla/5.0 (educational-scraper)"}

resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()

# Common pitfall: the encoding declared in the HTTP headers may differ
# from the one the document body actually uses.
print(f"Declared encoding (headers): {resp.encoding}")
print(f"Detected encoding (content): {resp.apparent_encoding}")

# Prefer the content-sniffed encoding over the declared one.
resp.encoding = resp.apparent_encoding

soup = BeautifulSoup(resp.text, "html.parser")

# Headlines live in h1/h2/h3 elements; collect their stripped text.
headlines = []
for tag in soup.find_all(["h1", "h2", "h3"]):
    headlines.append(tag.get_text(strip=True))

for headline in headlines[:10]:
    print(f"{headline[:80]}")
Declared encoding (headers): UTF-8
Detected encoding (content): utf-8
Paul Taylordouble quotation markA crowded field could gift French election to fa
InterviewPreston looks back at a tumultuous career
InterviewColombia’s VP blames racism for years of frustration
Today in FocusWhen the ‘Dubai dream’ goes wrong
TelevisionGrayson Perry's insights into AI are mindblowing
Analysisdouble quotation markIs this how you win things? Arsenal hope so
News
Middle East crisis liveNetanyahu ‘to speak to Lebanese leader leader today’ but 
Middle East crisisUS and Iran in indirect talks to extend two-week ceasefire
AnalysisTrump needs a better Iran deal than Obama’s – but faces major hurdles
  • OCR

Optical Character Recognition (OCR) converts images of printed text to machine-readable strings. Handwritten Text Recognition (HTR) handles manuscripts and handwritten records.

Quality is measured by two metrics:

  • Character Error Rate (CER): fraction of characters incorrectly transcribed
  • Word Error Rate (WER): fraction of words containing at least one error
# Download a scanned page of the Universal Declaration of Human Rights to
# use as OCR input.
image_url = "https://upload.wikimedia.org/wikipedia/commons/d/dd/The_universal_declaration_of_human_rights_10_December_1948.jpg"

# Wikimedia asks clients to identify themselves with a descriptive UA string.
headers = {
    "User-Agent": "MyOCRScript/1.0 (https://test.com/; <mailto:contact@test.com>)",
}

session = requests.Session()
resp = session.get(image_url, headers=headers, timeout=10)

resp.raise_for_status()  # will raise if still a 4xx/5xx

img = Image.open(BytesIO(resp.content))
# The printed shape (height, width, 3) shows an RGB raster of 8-bit values.
img_array = np.array(img)
print(f"Shape: {img_array.shape}, dtype: {img_array.dtype}")
Shape: (2698, 2000, 3), dtype: uint8

A raster image is an array of Red, Green, Blue values (see previous class on raster data for geography).

# Run EasyOCR with an English recognizer over the scanned page.
reader = easyocr.Reader(['en']) 
result = reader.readtext(img)
# Each detection is (bounding_box, text, confidence); keep just the text.
text = ' '.join([detection[1] for detection in result])
text[0:100]
'THE UUNIVERSAL DECLARATION OF Human WIIEREAS recognition of the inherent dignity and of the equal an'
# Show the scanned page next to the first 200 characters of OCR output.
fig, (ax_img, ax_txt) = plt.subplots(1, 2)

ax_img.imshow(img)
ax_img.axis("off")
ax_img.set_title("Original Image")

# Render the OCR excerpt as monospace text in the right-hand panel.
ax_txt.text(
    0.05, 0.95, text[0:200] + "...",
    transform=ax_txt.transAxes,
    fontsize=10,
    verticalalignment="top",
    family="monospace",
    wrap=True,
)
ax_txt.axis("off")
ax_txt.set_title("OCR Output")

plt.tight_layout()
plt.show()

def extract_udhr_text(html: str) -> str:
    """Strip an HTML page down to NFC-normalised plain text.

    Drops <script> and <style> elements wholesale, replaces every remaining
    tag with a space, collapses runs of whitespace, and applies Unicode NFC
    normalisation so the result compares cleanly against other sources.
    """
    # Remove script/style elements together with their contents.
    for element in ("script", "style"):
        html = re.sub(rf"<{element}[^>]*>.*?</{element}>", "", html, flags=re.DOTALL)
    # Any leftover tag becomes a single space, then whitespace is collapsed.
    stripped = re.sub(r"<[^>]+>", " ", html)
    collapsed = re.sub(r"\s+", " ", stripped).strip()
    return unicodedata.normalize("NFC", collapsed)


# Fetch a clean transcription of the declaration from Wikisource to serve
# as ground truth for evaluating the OCR output.
resp = session.get(
    "https://en.wikisource.org/wiki/Universal_Declaration_of_Human_Rights",
    headers=headers, timeout=10
    )

resp.raise_for_status()

ground_truth = extract_udhr_text(resp.text)

ground_truth[0:100]
'Universal Declaration of Human Rights - Wikisource, the free online library Jump to content Main men'
# Word-level error rate: Levenshtein distance between the two token
# sequences, normalised by the reference length (a WER-style metric).
# NOTE(review): ground_truth still contains Wikisource navigation chrome
# (see the "Jump to content Main men..." preview above), so this rate
# overstates the OCR errors — consider trimming to the article body.
ground_truth_split = ground_truth.split()
text_split = text.split()

editdistance.eval(ground_truth_split, text_split) / len(ground_truth_split)
0.8840328861964517
  • Encodings

Computers store data as 0s & 1s.

Character encodings map 0s/1s to actual characters.

There are many different encodings; you should almost always apply Unicode normalisation if your corpora span languages or historical periods, or if you pool documents from different sources which may have used different encodings.

Failing to normalise before tokenisation produces duplicate tokens, incorrect frequency counts, and silent data loss.

# A combining diaeresis (U+0308) after "u" renders like "ü"...
print("u\u0308")
# ...but the decomposed sequence does not compare equal to precomposed "ü".
"ü" == "u\u0308"
False
unicodedata.normalize("NFC", "u\u0308") == "ü"  
True

Some basic processing tasks & tools

  • Regular expressions (REGEX). A regex is a pattern that describes a set of strings. The engine scans input left-to-right, attempting to match the pattern at each position, e.g. to match email addresses, we might want a substring that fits the following pattern:
    • any number of alphanumeric characters and ., _, + or -
    • followed by @
    • followed by the domain name (alphanumeric character, dots or hyphen)
    • followed by a dot .
    • followed by the TLD (at least two letters)

Note: this does not actually catch all valid email addresses.

# Verbose email regex with a named group for each component (local part,
# domain, TLD); re.VERBOSE allows the whitespace and inline comments.
email_pattern = re.compile(
    r"""
    (?P<local>     [a-zA-Z0-9._+-]+)     # local part
    @                                    # literal @
    (?P<domain>     [a-zA-Z0-9.-]+)      # domain
    \.                                   # literal dot
    (?P<tld>        [a-zA-Z]{2,})        # TLD
    """,
    re.VERBOSE
)

# Print every address found in the first 20 extracted papers.
# NOTE(review): the loop variable rebinds `text`, clobbering the OCR output
# stored in `text` above — rename one of them if both are needed later.
for fname, text in list(papers_text.items())[:20]:
    matches = email_pattern.finditer(text or "")
    emails = [m.group(0) for m in matches]
    print(f"{fname} {emails}")
2604.01933v1.pdf ['szb0288@auburn.edu', 'jnunley@uwlax.edu', 'alan.seals@auburn.edu', 'mingzhou.wang@uga.edu']
2604.01364v1.pdf ['cespinal@eafit.edu.co']
2603.07893v2.pdf ['caitken@uchicago.edu', 'kremermr@uchicago.edu']
2603.12301v1.pdf ['wesley@topos.institute']
2603.15832v1.pdf ['zy.kang@utoronto.ca']
2604.00874v1.pdf ['cashman@mit.edu']
2604.10570v1.pdf ['20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn', '20250612@hhu.edu.cn']
2603.15700v1.pdf ['nandini.maroo@research.iiit.ac.in', 'kavita.vemuri@iiit.ac.in']
2603.12129v1.pdf []
2603.27724v1.pdf ['ertian.chen.19@ucl.ac.uk', 'lichao.chen.17@ucl.ac.uk', 'l.nesheim@ucl.ac.uk']
2603.12128v2.pdf []
2604.11384v1.pdf ['rok.spruk@ef.uni-lj.si']
2604.02875v1.pdf []
2603.26853v1.pdf []
2604.13998v1.pdf ['lev.razumovskiy@ramax.com', 'nikolay.karenin@ramax.com', 'msafro@ramax.com']
2603.29070v2.pdf ['sandro.ambuehl@econ.uzh.ch', 'rbhui@mit.edu', 'heidi.thysen@nhh.no']
2603.09637v1.pdf ['yamaei@seinan-gu.ac.jp', 'ohtake@cider.osaka-u.ac.jp']
2603.16006v1.pdf ['indrefjorden@pm.me']
2603.08603v1.pdf []
2603.21895v1.pdf ['devetak@csh.ac.at', 'antoine.mandel@univ-paris1.fr']
  • Tokenization: splitting text into units. Three levels of granularity:
    • Whitespace / rule-based (spaCy, NLTK): splits on spaces and punctuation.
    • Subword / BPE (Byte Pair Encoding): builds a vocabulary by iteratively merging the most frequent character pairs.
    • Character-level: splits text into individual characters.
  • Normalization:
    • Lemmatization (“organizing” -> “organize”)
    • Stemming (“organizing” -> “organ”)
    • Stopword removal
# Fetch the NLTK resources used below (tokenizer models, POS tagger,
# WordNet for lemmatization, stopword list); quiet=True hides progress.
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("stopwords", quiet=True)
True
# Row 43 of the inaugural table (named `jfk` — presumably Kennedy's 1961
# address; verify against the table). Naive whitespace splitting keeps
# punctuation glued to words ("Johnson,", "Justice,").
jfk = us.loc[43, "text"]
jfk.split()[0:20]
['Vice',
 'President',
 'Johnson,',
 'Mr.',
 'Speaker,',
 'Mr.',
 'Chief',
 'Justice,',
 'President',
 'Eisenhower,',
 'Vice',
 'President',
 'Nixon,',
 'President',
 'Truman,',
 'reverend',
 'clergy,',
 'fellow',
 'citizens,',
 'we']
# NLTK's tokenizer, by contrast, separates punctuation into its own tokens.
nltk_tokens = nltk.word_tokenize(jfk)
nltk_tokens[0:20]
['Vice',
 'President',
 'Johnson',
 ',',
 'Mr.',
 'Speaker',
 ',',
 'Mr.',
 'Chief',
 'Justice',
 ',',
 'President',
 'Eisenhower',
 ',',
 'Vice',
 'President',
 'Nixon',
 ',',
 'President',
 'Truman']
# Compare stemming (crude suffix stripping) with lemmatization (dictionary
# lookup, here forcing verb POS) and flag stopwords for a slice of tokens.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

for w in nltk_tokens[150:175]:
    # Stopword check is case-insensitive; stem/lemma keep original casing.
    is_stop = "✓" if w.lower() in stop_words else ""
    print(f"{w:<18} {stemmer.stem(w):<15} {lemmatizer.lemmatize(w, pos='v'):<15} {is_stop}")
,                  ,               ,               
but                but             but             ✓
from               from            from            ✓
the                the             the             ✓
hand               hand            hand            
of                 of              of              ✓
God                god             God             
.                  .               .               
We                 we              We              ✓
dare               dare            dare            
not                not             not             ✓
forget             forget          forget          
today              today           today           
that               that            that            ✓
we                 we              we              ✓
are                are             be              ✓
the                the             the             ✓
heirs              heir            heirs           
of                 of              of              ✓
that               that            that            ✓
first              first           first           
revolution         revolut         revolution      
.                  .               .               
Let                let             Let             
the                the             the             ✓

Application : Zipf’s law

Word frequency follows a power law : \[f(r) \propto \frac{1}{r^s}\]

where \(r\) is the frequency rank and \(s \approx 1\). The most frequent word is roughly twice as frequent as the second most frequent, three times the third, and so on.

Note this means there is a massive long tail: most vocabulary items are rare.

# Tokenize the pooled paper text, lemmatize it, and check how well word
# frequencies follow Zipf's law via an OLS fit in log-log space.
tokens = nltk.word_tokenize(' '.join(papers_text.values()).lower())

# NOTE(review): pos='v' lemmatizes only as verbs; nouns (e.g. plurals)
# pass through unchanged — confirm this is intended.
tokens = [
    lemmatizer.lemmatize(w, pos='v')
    for w in tokens
]

# Ranks 1..V paired with counts in decreasing order of frequency.
freq = Counter(tokens)
ranks = np.arange(1, len(freq) + 1)
counts = np.array([c for _, c in freq.most_common()])

# Zipf's law predicts a straight line with slope ≈ -s in log-log space.
log_ranks = np.log(ranks)
log_counts = np.log(counts)
slope, intercept = np.polyfit(log_ranks, log_counts, 1)

fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(log_ranks, log_counts, s=4, alpha=0.4, label='Observed')
ax.plot(log_ranks, slope * log_ranks + intercept, color='crimson', lw=1.5,
        label=f'OLS fit  slope={slope:.2f}')

ax.set_xlabel('log(rank)')
ax.set_ylabel('log(frequency)')
ax.set_title("Zipf's Law")
ax.legend()
plt.tight_layout()
plt.show()

We may need to represent texts as some kind of numerical data structure that would be easier to work with. Examples include:

  • Document term matrix: 1 entry per token, value is the number of times the word appears in a given document. We lose word order.
# Build a document-term matrix over the inaugural speeches: one row per
# speech, one column per vocabulary item, entries are raw counts.
vectorizer = CountVectorizer(stop_words="english", min_df=1)
dtm = vectorizer.fit_transform(us["text"])

df_dtm = pd.DataFrame(
    dtm.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=[f"doc_{i}" for i in range(len(us))],
)
print(f"Matrix: {dtm.shape[0]} speeches × {dtm.shape[1]} tokens")
# nnz = number of stored (non-zero) entries in the sparse matrix.
print(f"Sparsity: {1 - dtm.nnz / (dtm.shape[0] * dtm.shape[1]):.1%}\n")
df_dtm
Matrix: 60 speeches × 9139 tokens
Sparsity: 93.1%
000 100 108 11 120 125 13 14th 15th 16 ... young younger youngest youth youthful youâ zeal zealous zealously zone
doc_0 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
doc_3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
doc_4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 3 0 0 0
doc_5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
doc_6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 1 1 0 1
doc_8 5 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 0
doc_9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
doc_10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0
doc_11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
doc_13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
doc_14 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
doc_15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
doc_16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_20 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_22 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
doc_23 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
doc_25 0 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 0 0 0 0 0
doc_26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_27 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 3 1 0 0
doc_28 2 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_30 2 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
doc_31 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_32 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_33 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_34 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_35 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0
doc_36 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_37 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_38 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_39 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_40 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
doc_41 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_42 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_43 1 1 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_44 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_45 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
doc_46 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_47 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
doc_48 0 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 0 0 0 0 0
doc_49 0 0 0 0 0 0 1 0 0 0 ... 2 0 0 1 0 0 0 0 0 0
doc_50 0 0 0 0 0 0 0 0 0 0 ... 4 0 0 0 0 0 0 0 0 0
doc_51 0 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 0 0 0 0 0
doc_52 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_53 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_54 0 0 0 0 0 0 0 0 0 0 ... 1 0 1 0 0 0 0 0 0 0
doc_55 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_56 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 1 0 0 0 0 0 0
doc_57 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
doc_58 1 0 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
doc_59 1 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 1 0 0 0 0

60 rows × 9139 columns

  • TF-IDF: Term Frequency–Inverse Document Frequency is a numerical statistic that reflects how characteristic a word is of a particular document within a corpus. TF captures how often a term appears in a document (more frequent in a document = more important, probably), while IDF penalizes terms that appear in many documents (appears everywhere, e.g. stop words, are not informative).

\[\text{TF-IDF}(t, d) = f_{t,d} \cdot \log\frac{N}{|\{d' : t \in d'\}|}\]

where \(f_{t,d}\) is the raw term count in document \(d\), \(N\) is the total number of documents, and the denominator counts documents containing \(t\).

This gives us a sparse vector representation of documents — i.e. we turned a list of tokens into a numeric vector.

# TF-IDF over unigrams and bigrams, capped at the 500 strongest features;
# then show the five most characteristic terms of the first ten speeches.
vectorizer = TfidfVectorizer(
    max_features=500, stop_words="english", min_df=1, ngram_range=(1, 2),
)
tfidf_matrix = vectorizer.fit_transform(us["text"])
feature_names = vectorizer.get_feature_names_out()

print(f"TF-IDF matrix: {tfidf_matrix.shape[0]} docs × {tfidf_matrix.shape[1]} features\n")

for i, row in us.head(10).iterrows():
    scores = tfidf_matrix[i].toarray().flatten()
    # argsort is ascending; take the last five indices, reversed, for top-5.
    top_idx = scores.argsort()[-5:][::-1]
    top_terms = [(feature_names[j], f"{scores[j]:.3f}") for j in top_idx]
    print(f"  {row['speaker']} ({row['year']}): {top_terms}")
TF-IDF matrix: 60 docs × 500 features

  Washington (1789): [('government', '0.242'), ('ought', '0.222'), ('public', '0.215'), ('present', '0.209'), ('measures', '0.183')]
  Washington (1793): [('shall', '0.357'), ('oath', '0.319'), ('distinguished', '0.228'), ('endeavor', '0.222'), ('presence', '0.203')]
  Adams (1797): [('people', '0.319'), ('government', '0.273'), ('nations', '0.197'), ('foreign', '0.196'), ('constitution', '0.176')]
  Jefferson (1801): [('government', '0.263'), ('principle', '0.221'), ('let', '0.194'), ('man', '0.182'), ('fellow citizens', '0.181')]
  Jefferson (1805): [('public', '0.318'), ('state', '0.213'), ('limits', '0.194'), ('citizens', '0.189'), ('fellow citizens', '0.182')]
  Madison (1809): [('public', '0.263'), ('nations', '0.234'), ('rights', '0.164'), ('states', '0.164'), ('peace', '0.164')]
  Madison (1813): [('war', '0.548'), ('honorable', '0.184'), ('united', '0.174'), ('country', '0.157'), ('spirit', '0.154')]
  Monroe (1817): [('states', '0.263'), ('government', '0.250'), ('great', '0.227'), ('united states', '0.178'), ('union', '0.166')]
  Monroe (1821): [('great', '0.269'), ('states', '0.215'), ('united states', '0.188'), ('war', '0.178'), ('revenue', '0.177')]
  Adams (1825): [('union', '0.404'), ('government', '0.235'), ('general', '0.174'), ('rights', '0.152'), ('public', '0.147')]

Cosine similarity is \[\cos(\theta) = \frac{\mathbf{a} \cdot \mathbf{b}}{\|\mathbf{a}\| \|\mathbf{b}\|} = \frac{\sum_{i} a_i b_i}{\sqrt{\sum_{i} a_i^2} \cdot \sqrt{\sum_{i} b_i^2}}\]

Measures angle between vectors, regardless of vector lengths (i.e. document magnitude does not change the results).

# Pairwise cosine similarity between all speech TF-IDF vectors, shown as a
# heatmap labelled with speaker and year.
sim_matrix = cosine_similarity(tfidf_matrix)

fig, ax = plt.subplots(figsize=(8, 6))
labels = [f"{r['speaker']}\n({r['year']})" for _, r in us.iterrows()]
sns.heatmap(sim_matrix, xticklabels=labels, yticklabels=labels,
            annot=False, fmt=".2f", cmap="YlOrRd", ax=ax)
ax.set_title("Cosine Similarity (TF-IDF)")
plt.xticks(rotation=45, ha="right", fontsize=7)
plt.yticks(fontsize=7)
plt.tight_layout()
plt.show()

# Average TF-IDF per year and plot how the corpus-wide top terms evolve.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df["year"] = us["year"].values
yearly = tfidf_df.groupby("year").mean()

# The five terms with the highest corpus-wide mean TF-IDF (the variable was
# previously misnamed `top20` despite holding only five terms).
top5 = tfidf_df.drop(columns="year").mean().nlargest(5).index
yearly[top5].plot(figsize=(14, 6), legend=True)

plt.title("TF-IDF score evolution — top 5 terms")
plt.xlabel("Year")
plt.ylabel("Mean TF-IDF")
plt.legend(bbox_to_anchor=(1.01, 1), loc="upper left", fontsize=8)
plt.tight_layout()
plt.show()

  • N-grams
# Count bigrams (pairs of adjacent non-stopword tokens) across all speeches
# and print the ten most frequent ones.
bigram_vec = CountVectorizer(ngram_range=(2, 2), stop_words="english")
bigrams = bigram_vec.fit_transform(us["text"])

print("Most frequent bigrams:")
# Column sums give corpus-wide counts per bigram.
bg_freq = pd.Series(
    bigrams.toarray().sum(axis=0),
    index=bigram_vec.get_feature_names_out(),
).sort_values(ascending=False)

for bg, count in bg_freq.head(10).items():
    print(f"  '{bg}' : {count}")
Most frequent bigrams:
  'united states' : 166
  'fellow citizens' : 118
  'american people' : 41
  'federal government' : 34
  'self government' : 30
  'men women' : 29
  'years ago' : 28
  'general government' : 25
  'vice president' : 21
  'constitution united' : 20
nlp = spacy.load("en_core_web_sm")

def build_cooccurrence(documents, window):
    """Count symmetric word co-occurrences within a sliding window.

    For every token, every *different* token appearing within `window`
    positions on either side contributes one count to the (alphabetically
    sorted) pair. Each co-occurrence is therefore counted once from each
    end, i.e. twice per pair instance; identical-word pairs are skipped.
    """
    counts = Counter()
    for tokens in documents:
        for idx, focus in enumerate(tokens):
            lo = max(0, idx - window)
            hi = idx + window + 1
            counts.update(
                tuple(sorted((focus, neighbour)))
                for neighbour in tokens[lo:hi]
                if neighbour != focus
            )
    return counts

# Lemmatize each speech with spaCy, dropping stopwords, punctuation and
# tokens shorter than three characters, then count co-occurrences within
# a ±3-token window.
tokenized_docs = []
for d in us["text"]:
    doc = nlp(d)
    tokenized_docs.append([
        t.lemma_.lower() for t in doc
        if not t.is_stop and not t.is_punct and len(t.text) > 2
    ])

cooc = build_cooccurrence(tokenized_docs, window=3)

for pair, count in cooc.most_common(10):
    print(f"  {pair[0]:20s}{pair[1]:20s} : {count}")
  states               — united               : 346
  citizen              — fellow               : 236
  government           — people               : 148
  great                — nation               : 106
  american             — people               : 104
  peace                — world                : 94
  nation               — world                : 90
  people               — states               : 82
  government           — states               : 82
  nation               — people               : 82

Pointwise Mutual Information:

\[PMI(x, y) = log_{2}\left( \frac{P(x,y) }{ P(x) \times P(y)} \right)\]

# Pointwise mutual information for the most frequent co-occurring pairs:
# compares the observed joint probability to what independence would predict.
total_cooc = sum(cooc.values())
word_freq = Counter()
for doc_tokens in tokenized_docs:
    word_freq.update(doc_tokens)
total_words = sum(word_freq.values())

print("PMI for the most frequent pairs:")
for pair, count in cooc.most_common(10):
    # Joint probability from co-occurrence counts; marginals from unigrams.
    p_xy = count / total_cooc
    p_x = word_freq[pair[0]] / total_words
    p_y = word_freq[pair[1]] / total_words
    if p_x > 0 and p_y > 0:
        pmi = np.log2(p_xy / (p_x * p_y))
        print(f"  {pair[0]:15s}{pair[1]:15s} : PMI = {pmi:+.2f}")
PMI for the most frequent pairs:
  states          — united          : PMI = +5.73
  citizen         — fellow          : PMI = +5.59
  government      — people          : PMI = +1.81
  great           — nation          : PMI = +2.12
  american        — people          : PMI = +3.12
  peace           — world           : PMI = +3.34
  nation          — world           : PMI = +2.24
  people          — states          : PMI = +1.99
  government      — states          : PMI = +1.94
  nation          — people          : PMI = +1.26

A TF-IDF based search engine: return paper with the closest TF-IDF score as measured by cosine similarity

def lemmatize(text):
    """Return *text* with every non-space token replaced by its spaCy lemma
    (NER and parser disabled for speed)."""
    doc = nlp(text, disable=["ner", "parser"])
    lemmas = [token.lemma_ for token in doc if not token.is_space]
    return " ".join(lemmas)

def lemmatize_batch(texts):
    """Lemmatize many documents efficiently via nlp.pipe (NER/parser off)."""
    lemmatized = []
    for doc in nlp.pipe(texts, batch_size=50, disable=["ner", "parser"]):
        lemmatized.append(" ".join(tok.lemma_ for tok in doc if not tok.is_space))
    return lemmatized

# Lemmatize the full paper corpus once and fit a TF-IDF index over it.
ids = list(papers_text.keys())
corpus = lemmatize_batch([papers_text[i] for i in ids])

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
# NOTE(review): `vocab` is not used anywhere in the visible code — verify
# it is needed further on, otherwise it can be dropped.
vocab = set(vectorizer.vocabulary_)

def query(q, k):
    """Return the k papers most similar to the query string q.

    The query is lemmatized, projected into the fitted TF-IDF space, and
    compared against every document by cosine similarity.
    Returns a list of (paper_id, rounded_score) pairs, best match first.
    """
    cleaned = " ".join(lemmatize(q).split())
    query_vector = vectorizer.transform([cleaned])
    sims = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ranked = np.argsort(sims)[::-1][:k]
    return [(ids[j], round(float(sims[j]), 4)) for j in ranked]
query("Sustainable Impact Analysis", 5)
[('2604.13150v1.pdf', 0.0867),
 ('2603.26702v1.pdf', 0.0498),
 ('2603.20674v2.pdf', 0.0474),
 ('2604.12991v1.pdf', 0.0398),
 ('2603.21815v2.pdf', 0.0381)]
i = ids[0]
results = query(papers_text[i], 5)
results
[('2604.01933v1.pdf', 1.0),
 ('2603.29121v1.pdf', 0.571),
 ('2603.08956v5.pdf', 0.5599),
 ('2604.04464v1.pdf', 0.5592),
 ('2604.01363v1.pdf', 0.55)]
print(papers_text[i][:500] + "...")
Hiring Discrimination
and the Task Content of Jobs:
Evidence from a Large-Scale R´esum´e Audit ∗
Sharon Braun†, Jonathan Bushnell, Zachary Cowell, David Dowling
Samuel Goldstein, Andrew Johnson, George Miller, John M. Nunley‡
R. Alan Seals§, and Mingzhou Wang¶
April 3, 2026
Abstract
We conducted a large-scale r´esum´e audit of 36,880 applications to 9,220 job advertisements
for new college graduates across the United States. Firms express task preferences through
job-advertisement text, which we...
print(papers_text[results[1][0]][0:500] + "...")
Economics of Human and AI Collaboration:
When is Partial Automation More Attractive than Full
Automation?
Wensu Li𝑎
Atin Aboutorabi𝑏
Harry Lyu𝑎
Kaizhi Qian𝑐
Martin Fleming𝑎
Brian C. Goehring𝑑
Neil Thompson𝑎∗
𝑎Massachusetts Institute of Technology
𝑏´Ecole Polytechnique F´ed´erale de Lausanne
𝑐IBM Research
𝑑IBM’s Institute for Business Value
{wensu, hlyu, marti264, neil t}@mit.edu
atin.aboutorabi@epfl.ch
kqian@ibm.com
goehring@us.ibm.com
Abstract
This paper develops a unified framework for evaluat...
  • Latent Dirichlet Allocation (LDA) is a model that assumes each document is a mixture of latent “topics”, and each topic is a distribution over words. Given a corpus, it infers what the topics might be, and what the proportion of topics might be in each document.
# Topic modeling on the inaugural addresses: TF-IDF features (top 50 terms),
# then LDA with 3 topics.
tfidf_lda = TfidfVectorizer(stop_words="english", max_features=50)
X_lda = tfidf_lda.fit_transform(us['text'])

lda_model = LatentDirichletAllocation(
    n_components = 3, # up to you to choose
    random_state = 42,
    max_iter = 30,
)
# doc_topics[i] is the topic-proportion vector of document i.
doc_topics = lda_model.fit_transform(X_lda)

feature_names = tfidf_lda.get_feature_names_out()

# Show the 7 highest-weight words of each topic.
for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-8:-1]]
    print(f"{topic_idx}: {', '.join(top_words)}")

print('\n\n')

# Print the dominant topic (and its share) for every 5th speech.
for i, d in enumerate(us['text']):
    if i % 5 != 0:
        continue
    dominant = doc_topics[i].argmax()
    conf = doc_topics[i].max()
    print(f"  {us['speaker'][i]} [{dominant}] ({conf:.0%})")
0: government, people, states, public, shall, country, union
1: constitution, god, know, work, war, union, interests
2: america, world, nation, new, people, freedom, today



  Washington [0] (87%)
  Madison [0] (87%)
  Jackson [0] (88%)
  Taylor [0] (86%)
  Grant [0] (86%)
  Harrison [0] (87%)
  Taft [0] (76%)
  Hoover [0] (53%)
  Truman [2] (85%)
  Nixon [2] (87%)
  Bush [2] (86%)
  Obama [2] (87%)
# Average topic shares per year and plot them as a stacked area chart.
topic_cols = [f"topic_{k}" for k in range(doc_topics.shape[1])]
df_topics = pd.DataFrame(doc_topics, columns=topic_cols)
df_topics['year'] = us['year'].values

topic_by_year = df_topics.groupby('year')[topic_cols].mean()

topic_by_year.plot(kind='area', stacked=True, figsize=(12, 5), colormap='tab10')
plt.ylabel("Mean topic share")
plt.title("Topic distribution over time")
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()

# Cluster the arXiv papers on their TF-IDF vectors and visualize them in 2-D (PCA).
paper_ids = list(papers_text.keys())
paper_texts = list(papers_text.values())

tfidf_lda = TfidfVectorizer(stop_words="english")
X = tfidf_lda.fit_transform(paper_texts)

N_CLUSTERS = 5
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)

pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(X.toarray())

fig, ax = plt.subplots(figsize=(10, 7))
# BUG FIX: the original plotted only clusters 0-2 although k-means produced 5;
# use one color per cluster and loop over all of them.
colors = ["#e74c3c", "#3498db", "#2ecc71", "#9b59b6", "#f39c12"]
for c in range(N_CLUSTERS):
    mask = clusters == c
    ax.scatter(coords[mask, 0], coords[mask, 1], c=colors[c],
               s=100, label=f"Cluster {c}", edgecolors="white", linewidth=0.5)
    for i in np.where(mask)[0]:
        ax.annotate(paper_ids[i], (coords[i, 0], coords[i, 1]),
                    fontsize=7, ha="center", va="bottom")

ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.0%} var.)")
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.0%} var.)")
ax.set_title("K-means clustering on TF-IDF representations")
ax.legend()
plt.tight_layout()
plt.show()

# Same clusters as above, but embedded with t-SNE instead of PCA.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(paper_ids)-1))
coords_tsne = tsne.fit_transform(X.toarray())

fig, ax = plt.subplots(figsize=(10, 7))
# BUG FIX: plot every cluster produced by k-means (5), not just the first 3.
palette = ["#e74c3c", "#3498db", "#2ecc71", "#9b59b6", "#f39c12"]
for c in range(kmeans.n_clusters):
    mask = clusters == c
    ax.scatter(coords_tsne[mask, 0], coords_tsne[mask, 1], c=palette[c],
               s=100, label=f"Cluster {c}", edgecolors="white", linewidth=0.5)
    for i in np.where(mask)[0]:
        ax.annotate(paper_ids[i], (coords_tsne[i, 0], coords_tsne[i, 1]),
                    fontsize=7, ha="center", va="bottom")

ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
ax.set_title("K-means clustering on TF-IDF representations (t-SNE)")
ax.legend()
plt.tight_layout()
plt.show()

Intuitions about “AI”-based NLP

Embeddings

Through TF-IDF we have a meaningful vector representation of documents (better than raw words counts). Can we get a meaningful vector representation of words, one that captures their meaning?

By transposing the TF-IDF matrix, each row is a word, each column is a document

# Transpose the TF-IDF matrix: rows become words, columns documents, so each
# row is a sparse "word vector" of per-document TF-IDF weights.
vec = TfidfVectorizer()
M = vec.fit_transform(us["text"])  # (N_docs, V)
Mt = M.T  # (V, N_docs), each row is a word vector

vocab = vec.vocabulary_  # word → index

def word_cosine(w1, w2):
    """Cosine similarity between the document-space vectors of two words."""
    row_a = Mt[vocab[w1]]
    row_b = Mt[vocab[w2]]
    return cosine_similarity(row_a, row_b)[0, 0]

word_cosine("country", "nation"), word_cosine("country", "world"), word_cosine("country", "rampart")
(np.float64(0.5536943399571078),
 np.float64(0.41698130026172064),
 np.float64(0.18164517836699098))

But it’s quite sparse …

Mt[vocab["rampart"]].toarray().flatten()
array([0.        , 0.        , 0.01444568, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

… is based entirely on co-occurrence within a document: two words that have a similar meaning but don’t appear together will have low similarity…

word_cosine("medicaid", "disease")
np.float64(0.0)

… and quite corpus-dependent: size of vector is just number of documents (which may be huge!).

  • Word embeddings compress the high-dimensional, sparse TF-IDF space into dense, low-dimensional vectors where proximity encodes semantic similarity ( “You shall know a word by the company it keeps.” — J.R. Firth, 1957)

    • One specific word vector computation algorithm: Word2Vec. Given a target word, predict its context words. The hidden layer weights become the word vectors. Words in similar contexts get similar vectors.

\[P(w_O \mid w_I) = \frac{\exp(\vec{v}_{w_O}^{\prime\top} \vec{v}_{w_I})}{\sum_{w=1}^{W} \exp(\vec{v}_w^{\prime\top} \vec{v}_{w_I})}\]

  • dot product: un-normalized cosine similarity (cosine similarity + magnitude effects)
  • softmax to turn it into a probability

If you want to go from words vectors to sentence/documents vectors, you might:

  • Average word vectors: cheap, loses word order, surprisingly effective
  • Use Transformer-based models trained for meaningful sentence embeddings.
# Tokenize each speech into lowercase alphabetic words (one "sentence" per speech).
all_sentences = []
for text in us["text"]:
    words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
    all_sentences.append(words)

# Train a small Word2Vec model on the inaugural-address corpus.
w2v_model = Word2Vec(
    sentences=all_sentences, vector_size=50, window=5,
    min_count=1, epochs=100, seed=42,
)

print(f"Vocabulary: {len(w2v_model.wv)} words, {w2v_model.wv.vector_size}-dim\n")

# Named defs instead of lambda assignments (PEP 8 E731); behavior unchanged.
def wv(x):
    """Vector of word x as a (1, dim) row, ready for cosine_similarity."""
    return w2v_model.wv[x].reshape(1, -1)

def wv_similarity(a, b):
    """Cosine similarity between words a and b in the trained Word2Vec space."""
    return cosine_similarity(wv(a), wv(b))

wv_similarity("country", "nation"), wv_similarity("country", "world"), wv_similarity("country", "rampart")
(array([[0.40341038]], dtype=float32),
 array([[0.08626413]], dtype=float32),
 array([[0.12627639]], dtype=float32))
wv("rampart")
array([[ 0.3430001 , -0.21314952, -0.26879248,  0.02889711,  0.46901292,
        -0.43728858,  0.2534325 ,  0.02975902, -0.23325758,  0.38660833,
         0.06783935, -0.7062172 ,  0.01552967,  0.3187782 , -0.4050863 ,
        -0.44634873, -0.51011324,  0.08673321, -0.82601094, -0.21443251,
         0.16752478, -0.15713313,  0.24109465, -0.3028874 , -0.62734085,
         0.19148315, -0.23922096,  0.04064028, -0.10031307, -0.19324633,
         0.16526258,  0.41425377, -0.34465447,  0.05152915,  0.23136517,
         0.21649626,  0.20062068,  0.41893396, -0.01899227, -0.44317508,
         0.15537885, -0.0823855 , -0.00721491,  0.22351976, -0.17293951,
        -0.2554784 ,  0.31995073,  0.49292347, -0.26328748, -0.1014408 ]],
      dtype=float32)
wv_similarity("medicaid", "disease")
array([[0.38631624]], dtype=float32)
def word_analogy(a,b,c, model, topn = 3):
    """
    a is to b as c is to ?
    Uses vector arithmetic: result ≈ b - a + c
    """
    if hasattr(model, 'wv'):
        return model.wv.most_similar(positive=[b, c], negative=[a], topn=topn)
    else:
        return model.most_similar(positive=[b, c], negative=[a], topn=topn)

word_analogy("man", "king", "woman", w2v_model),  word_analogy("man", "king", "woman", w2v_pretrained_model)
([('themes', 0.5876736640930176),
  ('george', 0.5738810896873474),
  ('accumulate', 0.5644158720970154)],
 [('queen', 0.7118193507194519),
  ('monarch', 0.6189674139022827),
  ('princess', 0.5902431011199951)])

Vectors represent context similarity in the corpus, which should represent meaning similarity in the corpus. They reflect what is said in the corpus, whatever that is:

word_analogy("man", "doctor", "woman", w2v_pretrained_model), word_analogy("woman", "doctor", "man", w2v_pretrained_model)
([('gynecologist', 0.7093892097473145),
  ('nurse', 0.6477287411689758),
  ('doctors', 0.6471460461616516)],
 [('physician', 0.6463665962219238),
  ('doctors', 0.5858404040336609),
  ('surgeon', 0.5723941326141357)])
def cluster_and_plot(X, label):
    """K-means cluster the rows of X, embed them with t-SNE, and scatter-plot.

    X     : (n_samples, n_features) feature matrix.
    label : plot title.
    Annotates each point with the speaker name from the global `us`.
    """
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X)

    # BUG FIX: one color per cluster — the original plotted only clusters 0-2
    # although k-means produced 5.
    colors = ["#e74c3c", "#3498db", "#2ecc71", "#9b59b6", "#f39c12"]

    # BUG FIX: perplexity must be bounded by the number of rows of X, not by
    # the unrelated global `paper_ids` (this function is also called on `us`).
    n_samples = X.shape[0]
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_samples - 1))
    coords_tsne = tsne.fit_transform(X)

    fig, ax = plt.subplots(figsize=(10, 7))
    for c in range(n_clusters):
        mask = clusters == c
        ax.scatter(coords_tsne[mask, 0], coords_tsne[mask, 1], c=colors[c],
                s=100, label=f"Cluster {c}", edgecolors="white", linewidth=0.5)
        for i in np.where(mask)[0]:
            ax.annotate(us["speaker"][i], (coords_tsne[i, 0], coords_tsne[i, 1]),
                        fontsize=7, ha="center", va="bottom")

    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.set_title(label)
    ax.legend()
    plt.tight_layout()
    plt.show()
# Cluster the inaugural addresses on TF-IDF features.
tfidf_lda = TfidfVectorizer(stop_words="english")
X = tfidf_lda.fit_transform(us["text"])
cluster_and_plot(X, "tfidf")

sbert = SentenceTransformer("all-MiniLM-L6-v2")
Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 6647.48it/s]


BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2

Key                     | Status     |  | 

------------------------+------------+--+-

embeddings.position_ids | UNEXPECTED |  | 



Notes:

- UNEXPECTED:   can be ignored when loading from different task/architecture; not ok if you expect identical arch.
# Sentence-BERT embeddings for the same texts, then the same clustering.
X = sbert.encode(us["text"].tolist())
cluster_and_plot(X, "sbert")

Training a language model

Autoregressive language model: given the previous tokens, we want to predict the next tokens

BPE

Transformers use subword tokenization, unlike the word == token that we have used so far.

BPE builds a vocabulary by merging frequent character pairs.

Define a target vocabulary size.

Start with individual characters (a, b, c, …), then iteratively merges the most frequent adjacent pair into a single token, repeating until a target vocabulary size is reached.

This lets the model represent common words as single tokens while handling rare or unknown words by decomposing them into smaller subword pieces.

# Train a BPE tokenizer (2500 merges) on the inaugural addresses.
# NOTE(review): Tokenizer, BPE, Whitespace, BpeTrainer come from the
# `tokenizers` package — presumably imported in an earlier cell; verify.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=2500, special_tokens=["[UNK]"]) # special token for unknown tokens
tokenizer.train_from_iterator(us["text"], trainer=trainer)

# Encode each speech into its BPE token strings; peek at the first few.
tokenized = [tokenizer.encode(t).tokens for t in us["text"]]
[t[:10] for t in tokenized[:3]]


[['Fellow', '-', 'C', 'it', 'iz', 'ens', 'of', 'the', 'Sen', 'ate'],
 ['Fellow',
  'citizens',
  ',',
  'I',
  'am',
  'again',
  'called',
  'upon',
  'by',
  'the'],
 ['When', 'it', 'was', 'first', 'per', 'ce', 'ived', ',', 'in', 'early']]

Embeddings

As seen before, but computed from subwords tokens.

# Word2Vec over BPE subword tokens this time (tokens seen < 2 times are dropped).
w2v = Word2Vec(sentences=tokenized, vector_size=128, window=5, min_count=2, workers=4)
vocab = w2v.wv.key_to_index  # only trained tokens

w2v.wv["international"]
array([-0.02099682, -0.26246765,  0.17215264,  0.10287725,  0.18350555,
       -0.03710234,  0.1722833 ,  0.03048883, -0.00525223, -0.02247564,
        0.320979  , -0.09180356, -0.11842522, -0.13442157,  0.3271749 ,
        0.24858607, -0.07478909, -0.02689948, -0.20281672,  0.04279773,
        0.2002195 ,  0.28864926, -0.12873678, -0.32007623, -0.32968724,
        0.0466402 , -0.17337702,  0.01203176,  0.09860775, -0.09748263,
       -0.21236329, -0.03642882, -0.10875434,  0.07205793, -0.09943776,
        0.1452768 ,  0.2886817 ,  0.01141961,  0.14053734, -0.00852372,
        0.04174591,  0.16742122, -0.0289951 , -0.11426713,  0.2748173 ,
        0.00884397, -0.13019669, -0.16257633,  0.01069543, -0.04314931,
       -0.00688673,  0.01693919,  0.05730288,  0.24617828,  0.0884894 ,
       -0.00738741,  0.35180786, -0.13432164, -0.12747686,  0.10428333,
       -0.00085651, -0.13728805,  0.14409114, -0.1372666 ,  0.13265122,
       -0.04551877, -0.05324395,  0.05388435, -0.1087909 , -0.25990108,
        0.07634247, -0.06210941, -0.35482478, -0.11476447,  0.14741567,
       -0.2627897 , -0.06600689, -0.12050465, -0.20552929,  0.14045468,
       -0.02441647, -0.23963544,  0.12583525,  0.40702727,  0.16565253,
        0.2704503 ,  0.1444806 , -0.08679224,  0.10398602,  0.2909712 ,
       -0.09206758, -0.13204464, -0.05071013, -0.04830186,  0.30153158,
        0.0136872 , -0.20705487, -0.12898049, -0.13317327, -0.03317142,
       -0.2039833 , -0.12837198, -0.02121176,  0.09832101,  0.11794222,
       -0.09830709, -0.09749945,  0.04073918,  0.05384138, -0.3284526 ,
        0.14513792, -0.08966366,  0.0973779 ,  0.26014432, -0.05874548,
       -0.0722076 ,  0.1327287 ,  0.09591641,  0.11413575, -0.04559002,
       -0.1594712 , -0.13738623, -0.2087583 ,  0.0725778 , -0.1647514 ,
       -0.13576484, -0.07975017, -0.02259468], dtype=float32)

Training data

Let’s say we want to predict a token using the last 5 tokens.

# Sliding-window training data: predict token i from the previous CONTEXT tokens.
CONTEXT = 5

records_word, records_vec = [], []

for tokens in tokenized:
    # Drop tokens Word2Vec pruned (min_count=2) so every token has a vector.
    tokens = [t for t in tokens if t in vocab]
    for i in range(CONTEXT, len(tokens)):
        ctx   = tokens[i - CONTEXT : i]   # 5 context tokens
        target = tokens[i]
        records_word.append(ctx + [target])
        records_vec.append({
            "X": np.stack([w2v.wv[t] for t in ctx]),
            "y": w2v.wv[target],
        })

# in "word space"
cols = [f"c{i}" for i in range(CONTEXT)] + ["target"]
df_words = pd.DataFrame(records_word, columns=cols)

# in "vector space"
X_vec = np.stack([r["X"] for r in records_vec])   # (N, 5, 128)
y_vec = np.stack([r["y"] for r in records_vec])   # (N, 128)

Prediction

So now we need to define what computations we are going to apply on the 5-context tokens to try and predict the target tokens.

For instance we may average the 5 tokens and take the closest token to this average as our prediction.

# Baseline "model": average the 5 context vectors and predict the vocabulary
# token whose embedding is most cosine-similar to that average.
X_avg = X_vec.mean(axis=1) # compute average

# compute cosine similarity: dot product divided by the norms
X_norm = normalize(X_avg)
W_norm = normalize(w2v.wv.vectors)
cos_sim = X_norm @ W_norm.T

pred_idx = cos_sim.argmax(axis=1) # prediction is most similar token

# back to word space
pred_tokens = [w2v.wv.index_to_key[i] for i in pred_idx]
df_words["pred"] = pred_tokens
df_words.head(15)
c0 c1 c2 c3 c4 target pred
0 Fellow - C it iz ens ply
1 - C it iz ens of ying
2 C it iz ens of the Republic
3 it iz ens of the Sen parts
4 iz ens of the Sen ate ation
5 ens of the Sen ate and ation
6 of the Sen ate and of ing
7 the Sen ate and of the ing
8 Sen ate and of the H ing
9 ate and of the H ouse ation
10 and of the H ouse of ation
11 of the H ouse of Re of
12 the H ouse of Re present ation
13 H ouse of Re present atives whole
14 ouse of Re present atives : rights

Averaging is probably not great.

Perhaps an MLP? (see: class on modelling)

\[\hat{y} = W_2 \cdot \text{ReLU}(W_1 \cdot \bar{x} + b_1) + b_2\]

where \(\bar{x} \in \mathbb{R}^{128}\) is the averaged context, \(W_1 \in \mathbb{R}^{h \times 128}\), \(W_2 \in \mathbb{R}^{V \times h}\) (the code below uses a hidden width of \(h = 26\)), and \(\hat{y} \in \mathbb{R}^V\) are logits over vocabulary.

# Averaged context embeddings -> MLP -> logits over the vocabulary.
X_torch = torch.tensor(X_vec.mean(axis=1), dtype=torch.float32)
true_idx = np.array([w2v.wv.key_to_index[t] for t in df_words["target"].astype(str)])
y_torch = torch.tensor(true_idx, dtype=torch.long)

dataset = TensorDataset(X_torch, y_torch)
loader  = DataLoader(dataset, batch_size=512, shuffle=True)

V = len(w2v.wv)  # vocab size

# NOTE(review): hidden width here is 26 while the surrounding text mentions
# 256 — confirm which was intended.
mlp = nn.Sequential(
    nn.Linear(128, 26),
    nn.ReLU(),
    nn.Linear(26, V),
)
# Render the network graph to a PNG and display it inline.
model_graph = draw_graph(
    mlp,
    input_size=(1, 128),
    expand_nested=True,
    show_shapes=True,
)

model_graph.visual_graph.render("mlp_graph", format="png", cleanup=True)
from IPython.display import Image as IPImage
IPImage("mlp_graph.png")

Define a loss function, i.e. a measure of how bad predictions are.

For language models, the loss is cross-entropy, i.e. negative log-probability the model assigns to the correct next token:

\[\mathcal{L}(\theta) = -\frac{1}{N} \sum_{i=1}^{N} \log P_\theta(w_i \mid w_1, \ldots, w_{i-1})\]

\[\mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N} \log \frac{e^{\hat{y}_{i,y_i}}}{\sum_{j=1}^{V} e^{\hat{y}_{i,j}}}\]

where \(\hat{y}_{i,j}\) is the \(j\)-th logit for sample \(i\) and \(y_i \in \{1, \dots, V\}\) is the true token index.

\[\text{Perplexity} = e^{\mathcal{L}}\]

Perplexity is the “effective number of equally likely next tokens.” If PPL = 50, the model is as uncertain as if choosing uniformly among 50 candidates.

# Adam optimizer and cross-entropy loss over the vocabulary logits.
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)
loss_fn   = nn.CrossEntropyLoss()

We are going to find the parameters of the layers, of which there are:

sum(p.numel() for p in mlp.parameters())
67911

through gradient descent. How does that work?

The loss function is a function of the training data and the parameters that outputs a real number. We want to find the parameter values that minimize the output of the loss function, so we only consider how \(\mathcal{L}\) varies with respect to the parameter values and treat the training data as fixed.

Let’s define a function \(f(x, y)\) - two parameters, x and y - which we are going to minimize, and its gradient df.

def f(x, y):
    """Toy 2-D loss surface: a quadratic bowl plus a diagonal canyon,
    a sigmoid-gated linear slope, and small sinusoidal ripples."""
    quadratic = 0.15 * (x ** 2 + y ** 2)
    diag = y - 0.6 * x
    canyon = -2 * np.exp(-diag ** 2)
    gate = 1 / (1 + np.exp(3 * (x - 1)))
    return quadratic + canyon + x * gate + np.sin(x * y / 10)

def df(x, y, eps=1e-5):
    """Numerical gradient of f at (x, y) via central differences."""
    gx = (f(x + eps, y) - f(x - eps, y)) / (2 * eps)
    gy = (f(x, y + eps) - f(x, y - eps)) / (2 * eps)
    return np.array([gx, gy])

def gradient_descent(lr=0.25, n_steps=100):
    """Run plain gradient descent on f starting at (-9.5, -9.5), clipping the
    iterates to the box [-10, 10]^2.

    Returns an array of (x, y, loss) triples, one per step (n_steps + 1 rows).
    """
    px, py = -9.5, -9.5
    trajectory = [(px, py, f(px, py))]
    for _ in range(n_steps):
        gx, gy = df(px, py)
        px = np.clip(px - lr * gx, -10, 10)
        py = np.clip(py - lr * gy, -10, 10)
        trajectory.append((px, py, f(px, py)))
    return np.array(trajectory)

# Evaluate f on a 100x100 grid for the 3-D surface plot.
x_range = np.linspace(-10, 10, 100)
y_range = np.linspace(-10, 10, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = f(X, Y)

path = gradient_descent()

fig = plt.figure(figsize=(14, 9))
ax = fig.add_subplot(111, projection="3d")

ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, alpha=0.75,
                linewidth=0, antialiased=True, rstride=3, cstride=3)

ax.set_xlabel("Parameter x", fontsize=11, labelpad=10)
ax.set_ylabel("Parameter y", fontsize=11, labelpad=10)
ax.set_zlabel("Loss f(x,y)", fontsize=11, labelpad=10)
ax.view_init(elev=45, azim=+75)

# Artists updated frame-by-frame by the animation callbacks below.
line,  = ax.plot([], [], [], color="yellow", linewidth=2, zorder=5)
point, = ax.plot([], [], [], "o", color="red", markersize=6, zorder=6)
title  = ax.set_title("")

def init():
    # Blank starting frame for FuncAnimation (clears the path and the marker).
    line.set_data([], [])
    line.set_3d_properties([])
    point.set_data([], [])
    point.set_3d_properties([])
    return line, point, title

def update(frame):
    # Draw the descent path up to `frame` and move the marker to its tip.
    xs, ys, zs = path[:frame+1, 0], path[:frame+1, 1], path[:frame+1, 2]
    line.set_data(xs, ys)
    line.set_3d_properties(zs)
    point.set_data([xs[-1]], [ys[-1]])
    point.set_3d_properties([zs[-1]])
    title.set_text(f"Step {frame:03d} | loss = {zs[-1]:.4f}")
    return line, point, title

# Animate the descent path over the surface and save it as a GIF.
anim = FuncAnimation(fig, update, frames=len(path),
                     init_func=init, interval=60, blit=False)

from matplotlib.animation import PillowWriter
anim.save("gradient_descent.gif", writer=PillowWriter(fps=15))

from IPython.display import HTML
HTML('<img src="gradient_descent.gif" width="700">')

# Cross-check against scipy's general-purpose minimizer.
from scipy.optimize import minimize
argmin = minimize(lambda p: f(*p), x0=[0, 0]).x
argmin
array([-1.97299233, -1.05430789])
# Loss value along the descent path (third column of `path`).
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(path[:, 2], linewidth=1.5)
ax.set_xlabel("Step")
ax.set_ylabel("Loss f(x,y)")
ax.set_title("Gradient Descent — Loss Curve")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Actual optimization is done through more complex, but conceptually similar methods. Also, local minima are rare in high dimensions.

Now let’s apply this to our MLP:

# Move the model and data to GPU when available, then train for 10 epochs.
device = "cuda" if torch.cuda.is_available() else "cpu"
mlp = mlp.to(device)
X_torch = X_torch.to(device)
y_torch = y_torch.to(device)

# Rebuild the loader with tensors on the right device
dataset = TensorDataset(X_torch, y_torch)
loader = DataLoader(dataset, batch_size=512, shuffle=True)

for epoch in range(10):
    mlp.train()
    total_loss = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)  # belt and suspenders
        optimizer.zero_grad()
        loss = loss_fn(mlp(xb), yb)
        loss.backward()
        optimizer.step()
        # Re-weight the mean batch loss by batch size for a true epoch average.
        total_loss += loss.item() * len(xb)
    print(f"Epoch {epoch+1:02d} | loss: {total_loss/len(dataset):.4f}")

mlp.eval()
with torch.no_grad():
    logits = mlp(X_torch)
    pred_idx = logits.argmax(dim=1).cpu().numpy()  # .cpu() needed if on GPU

# Map predicted indices back to token strings for inspection.
df_words["pred"] = [w2v.wv.index_to_key[i] for i in pred_idx]
df_words.head(15)
Epoch 01 | loss: 6.5249
Epoch 02 | loss: 6.2636
Epoch 03 | loss: 6.2160
Epoch 04 | loss: 6.1841
Epoch 05 | loss: 6.1604
Epoch 06 | loss: 6.1423
Epoch 07 | loss: 6.1267
Epoch 08 | loss: 6.1127
Epoch 09 | loss: 6.0999
Epoch 10 | loss: 6.0875
c0 c1 c2 c3 c4 target pred
0 Fellow - C it iz ens ,
1 - C it iz ens of ,
2 C it iz ens of the .
3 it iz ens of the Sen of
4 iz ens of the Sen ate ,
5 ens of the Sen ate and ,
6 of the Sen ate and of the
7 the Sen ate and of the the
8 Sen ate and of the H the
9 ate and of the H ouse the
10 and of the H ouse of the
11 of the H ouse of Re the
12 the H ouse of Re present of
13 H ouse of Re present atives the
14 ouse of Re present atives : the

Not great!

Modern LLMs use self-attention layers extensively.

It lets every token directly attend to every other token in parallel:

\[\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right) V\]

Given our inputs \(X\), we might want to weight them by some matrix to give more weights to tokens that will be important for prediction and less to others.

Let’s start from the Gram matrix of a sequence of token embeddings

\[XX^\top\]

Each entry \((i,j)\) is the dot product of token \(i\) and token \(j\).

This tells how close in embedding space (\(\approx\) meaning, hopefully) two tokens are.

We can normalize using softmax to get weights between 0 and 1, i.e. each row is a probability distribution over all tokens, peaked on the most similar ones.

So a good weighting matrix might be \(\text{softmax}(XX^\top)\), from which we compute our weighted inputs \(\text{softmax}(XX^\top) X\)

In practice we normalize by \(1/\sqrt{d}\) to keep the inputs to softmax in a reasonable range where the distribution stays smooth and gradients flow.

So:

\[\text{softmax}\left(\frac{XX^\top}{\sqrt{d}}\right) X\]

The problem with this raw form is that similarity in embedding space is similarity on meaning

However we may want a broader notion of “similarity”, e.g. a verb & a noun will have different meanings/embeddings yet still be highly relevant to each other syntactically (e.g. a verb & its subject).

Consider the sentence “the cat ate the fish”

“Cat” and “ate” are dissimilar in meaning, but similar in the spaces of “noun available as subject” vs “verb looking for its subject”

To allow that we are going to project the embeddings by some weights (which we are going to learn during training): \(W_Q, W_K, W_V\) before computing the gram matrix to move from meaning to other dimensions

so let \(Q = XW_Q\), \(K = XW_K\) and \(V = XW_V\),

\[\text{softmax}\left(\frac{QK^\top}{\sqrt{d}}\right) V\]

This is self attention!

Each token’s output is a weighted average of all other tokens.

Then a transformer block is:

Input → [Self-Attention] → Add & LayerNorm (x + self-attention(x), then normalize) → [Feedforward like an MLP] → Add & LayerNorm → Output
# Hyperparameters for the BPE tokenizer, Word2Vec embeddings, and NanoGPT.
BPE_VOCAB_SIZE = 8_000
W2V_DIM        = 128     # must equal EMBED_DIM (vectors are copied into the model)
W2V_WINDOW     = 4
W2V_EPOCHS     = 10
BLOCK_SIZE     = 64      # context length (tokens)
BATCH_SIZE     = 16
EMBED_DIM      = 128
N_LAYERS       = 4
LR             = 1.5e-3
STEPS          = 1_000

assert EMBED_DIM == W2V_DIM
device = "cuda" if torch.cuda.is_available() else "cpu"
# Single concatenated corpus for tokenizer training and LM data.
corpus = " ".join(us["text"].tolist())

# NOTE(review): Tokenizer, BPE, Whitespace, BpeTrainer, decoders come from the
# `tokenizers` package — presumably imported in an earlier cell; verify.
tokenizer = Tokenizer(BPE(unk_token="[UNK]", end_of_word_suffix="</w>"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(
    vocab_size=BPE_VOCAB_SIZE,
    special_tokens=["[UNK]"],
    end_of_word_suffix="</w>",
)
tokenizer.train_from_iterator([corpus], trainer=trainer)
tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

# One long encoding of the whole corpus: ids feed the LM, tokens feed Word2Vec.
encoding    = tokenizer.encode(corpus)
tokens_flat = encoding.tokens
ids         = encoding.ids
vocab_size  = tokenizer.get_vocab_size()

data = torch.tensor(ids, dtype=torch.long)

# Chop the token stream into BLOCK_SIZE chunks as "sentences" for Word2Vec.
sentences = [
    tokens_flat[i : i + BLOCK_SIZE]
    for i in range(0, len(tokens_flat), BLOCK_SIZE)
]

# Skip-gram Word2Vec with negative sampling over the BPE tokens.
w2v = Word2Vec(
    sentences=sentences,
    vector_size=W2V_DIM,
    window=W2V_WINDOW,
    min_count=1,
    workers=4,
    epochs=W2V_EPOCHS,
    sg=1,
    negative=5,
    seed=42,
)

vocab      = tokenizer.get_vocab()
rng        = np.random.default_rng(42)

# Embedding matrix aligned with the tokenizer's ids; rows start as small
# uniform random values and are overwritten with trained Word2Vec vectors
# where available (`missing` counts tokens without a trained vector).
w2v_matrix = rng.uniform(
    -0.5 / W2V_DIM, 0.5 / W2V_DIM, size=(vocab_size, W2V_DIM)
).astype(np.float32)

missing = 0
for token, idx in vocab.items():
    if token in w2v.wv:
        w2v_matrix[idx] = w2v.wv[token]
    else:
        missing += 1

w2v_embeddings = torch.tensor(w2v_matrix)


class SelfAttention(nn.Module):
    """Single-head causal self-attention.

    Projects the input to queries, keys, and values, computes scaled
    dot-product attention with an upper-triangular (causal) mask so each
    position attends only to itself and earlier positions, then returns
    the attention-weighted values.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.query    = nn.Linear(embed_dim, head_dim, bias=False)
        self.key      = nn.Linear(embed_dim, head_dim, bias=False)
        self.value    = nn.Linear(embed_dim, head_dim, bias=False)
        self.head_dim = head_dim

    def forward(self, x):
        _, seq_len, _ = x.shape
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        # Scaled dot-product scores: (B, T, T).
        scores = q @ k.transpose(-2, -1)
        scores = scores / (self.head_dim ** 0.5)
        # Causal mask: positions above the diagonal cannot be attended to.
        causal = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()
        scores = scores.masked_fill(causal, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        return weights @ v


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: causal self-attention followed by a
    position-wise feed-forward network, each wrapped in a residual add."""

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Submodules created in the same order as elsewhere in the file so
        # seeded parameter initialization stays reproducible.
        self.attention = SelfAttention(embed_dim, head_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim), nn.GELU(),
            nn.Linear(4 * embed_dim, embed_dim),
        )
        # Maps the attention head's output back to the embedding width.
        self.proj = nn.Linear(head_dim, embed_dim, bias=False)

    def forward(self, x):
        attended = self.attention(self.ln1(x))
        x = x + self.proj(attended)
        x = x + self.ff(self.ln2(x))
        return x


class NanoGPT(nn.Module):
    """Minimal GPT: token + learned positional embeddings, a stack of
    transformer blocks, a final LayerNorm, and a linear LM head."""

    def __init__(self, vocab_size, embed_dim=64, n_layers=2, block_size=128):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.Sequential(
            *[TransformerBlock(embed_dim, embed_dim) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx):
        """Return next-token logits of shape (B, T, vocab_size)."""
        _, seq_len = idx.shape
        positions = torch.arange(seq_len, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)
        h = self.blocks(h)
        return self.head(self.ln_f(h))

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0):
        """Autoregressively sample max_new_tokens tokens, conditioning on at
        most the last block_size tokens at each step."""
        for _ in range(max_new_tokens):
            window = idx[:, -self.block_size:]
            logits = self(window)[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            nxt = torch.multinomial(probs, 1)
            idx = torch.cat([idx, nxt], dim=1)
        return idx


# 90/10 train/validation split of the token stream.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

model = NanoGPT(vocab_size, EMBED_DIM, N_LAYERS, BLOCK_SIZE).to(device)
# Warm-start the token embedding table with the Word2Vec vectors.
with torch.no_grad():
    model.tok_emb.weight.copy_(w2v_embeddings.to(device))
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

def get_batch(split):
    """Sample a random mini-batch of (input, target) token windows.

    Targets are the inputs shifted one position right — the standard
    next-token-prediction setup. Reads the notebook-level globals
    train_data, val_data, BLOCK_SIZE, BATCH_SIZE and device.
    """
    source = train_data if split == "train" else val_data
    # Random window start positions; upper bound keeps the shifted target
    # slice in range.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))
    inputs  = torch.stack([source[s : s + BLOCK_SIZE] for s in starts]).to(device)
    targets = torch.stack([source[s + 1 : s + 1 + BLOCK_SIZE] for s in starts]).to(device)
    return inputs, targets

# Training loop: sample a batch, compute next-token cross-entropy, take one
# AdamW step. Every 100 steps (and at the last step) log validation loss
# and perplexity.
train_losses = []
for step in range(STEPS):
    xb, yb = get_batch("train")
    # Flatten (B, T, vocab) logits and (B, T) targets for cross_entropy.
    loss   = F.cross_entropy(model(xb).view(-1, vocab_size), yb.view(-1))
    optimizer.zero_grad(); loss.backward(); optimizer.step()

    if step % 100 == 0 or step == STEPS - 1:
        # Single random validation batch — cheap but noisy estimate.
        # NOTE(review): model stays in train mode here; harmless for this
        # architecture (no dropout / batch norm), but worth confirming.
        xv, yv = get_batch("val")
        with torch.no_grad():
            val_loss = F.cross_entropy(model(xv).view(-1, vocab_size), yv.view(-1))
        ppl = math.exp(val_loss.item())  # perplexity = exp(cross-entropy)
        print(f"Step {step:4d} | train : {loss.item():.3f} | val : {val_loss.item():.3f} | PPL : {ppl:.1f}")
        train_losses.append({"step": step, "train": loss.item(), "val": val_loss.item(), "ppl": ppl})


# Plot the logged metrics: loss curves on the left, validation perplexity
# on the right.
loss_df = pd.DataFrame(train_losses)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(loss_df["step"], loss_df["train"], label="Train", marker="o")
axes[0].plot(loss_df["step"], loss_df["val"],   label="Val",   marker="s")
axes[0].set(xlabel="Step", ylabel="Cross-entropy loss", title="Loss Curve")
axes[0].legend(); axes[0].grid(True, alpha=0.3)

axes[1].plot(loss_df["step"], loss_df["ppl"], label="Val PPL", marker="s", color="orange")
axes[1].set(xlabel="Step", ylabel="Perplexity"); axes[1].legend(); axes[1].grid(True, alpha=0.3)

plt.tight_layout(); plt.show()
# Final validation perplexity (last logged row).
print(f"Perplexity : {loss_df['ppl'].iloc[-1]:.1f}")
Step    0 | train : 9.183 | val : 8.953 | PPL : 7732.6
Step  100 | train : 6.432 | val : 6.849 | PPL : 943.1
Step  200 | train : 5.835 | val : 6.317 | PPL : 553.7
Step  300 | train : 5.710 | val : 5.902 | PPL : 365.7
Step  400 | train : 5.297 | val : 6.021 | PPL : 411.9
Step  500 | train : 5.587 | val : 5.965 | PPL : 389.4
Step  600 | train : 5.282 | val : 6.034 | PPL : 417.2
Step  700 | train : 5.222 | val : 5.939 | PPL : 379.4
Step  800 | train : 4.997 | val : 6.293 | PPL : 540.9
Step  900 | train : 4.934 | val : 5.951 | PPL : 384.1
Step  999 | train : 4.837 | val : 5.452 | PPL : 233.3

Perplexity : 233.3

Temperature rescales the logits before the softmax: \(p_i = \operatorname{softmax}(z_i / T)\). Low \(T\) sharpens the distribution (near-greedy sampling); high \(T\) flattens it (more diverse but less coherent output).

prompt   = "My fellow americans, "

# Encode the prompt into token ids; add a leading batch dimension.
seed_ids = tokenizer.encode(prompt).ids
seed_idx = torch.tensor([seed_ids], dtype=torch.long, device=device)

# Sample continuations at three temperatures to show the trade-off between
# repetitive (low T) and incoherent (high T) output.
for temp in [0.1, 0.5, 1.5]:
    out  = model.generate(seed_idx.clone(), max_new_tokens=80, temperature=temp)
    ids  = out[0].tolist()
    text = tokenizer.decode(ids)
    print(f"T={temp}: {text}\n")
T=0.1: My fellow americans , and the world , and the world , and the world , and the world , and the world , and the world . The world is a new century , and the world ' s is a new century , and we have the world . We have no longer be done , and we have the world . We have not , and we have the world . We have the world , and we have no longer

T=0.5: My fellow americans , and the people of the world ' s people , that we can not do not to the world , but we are to it , we have the world . We have to the world . We have a new task , and we must not do , and by the general and fight to the world , and you , and God . We have a new act of the people , we have been long as we

T=1.5: My fellow americans , and are doing certain independent President millions them treasures on either that embraced questions heartily and singular undisturbed if useful o raween invely now unknown shows a nation . These purpose of a trained invariably Iâ economy passion let us we learned in privileges must have done dependent . Even on righteousness part already completed legislation seeking against the complaplaced by annual prness in freedom incidental after so long thoughtfully approval of enemies care will regions
# Baseline for comparison: the prompt followed by 80 uniformly random token
# ids — shows what "no model at all" looks like next to the samples above.
random_ids  = torch.cat([
    seed_idx,
    torch.randint(0, vocab_size, (1, 80), device=device)
], dim=1)
random_text = tokenizer.decode(random_ids[0].tolist())
random_text
'My fellow americans , companfortunes individuAuthor characterize Enmake restrain larg/ reforms conquer alleled adversIf debts hurrPeopabinet conquheretofore rising immedibrethren benevolcombined gresuntil multiplunavominor memories JohignorIntericulnegoclean test ates friend aled unciffronmarine renewing Among kindness approach solemn recent instrumentality interpretation too auspices pioneers forefathers sacred vain administration electors endured valpracticfearless My entertained 2 unrivalannual ignlengclosed recognht situAcsun1776'

Social science applications

Gendered school reports

Do French teachers use different vocabulary to describe boys and girls at the same academic level?

Can the sex of a student be predicted from the words used in their report card, after controlling for performance?

  • https://www.ipp.eu/wp-content/uploads/2026/01/Note_IPP_Genres_appreciations_121.pdf

.

# Synthetic data to illustrate the technique

# Stereotypically "female-coded" French report-card vocabulary
# (seriousness, effort, fragility).
female_words = [
    "sérieuse", "exemplaire", "persévérer", "appliquée", "studieuse",
    "difficultés", "fragilités", "failles", "hésitations", "découragée",
    "irréprochable", "confiance", "bravo", "efforts", "sérénité"
]
# Stereotypically "male-coded" vocabulary (talent, carelessness).
male_words = [
    "intuition", "passion", "curiosité", "idées", "aisance",
    "brouillon", "dilettante", "désinvolture", "puéril", "agité",
    "s'amuse", "superficiel", "endormi", "inexploité", "réveil"
]

# Performance-related words used regardless of the student's sex.
neutral_words = ["travail", "résultats", "trimestre", "niveau", "progrès"]

def generate_report(sex, n=300):
    """Build one synthetic report card as a space-joined bag of words.

    A report mixes 4 words drawn from the stereotyped vocabulary matching
    `sex`, 1 word from the opposite list, and 3 neutral words, in random
    order. Uses the global NumPy RNG state.

    NOTE(review): the `n` parameter is never used — confirm original intent
    (report length?) before removing or wiring it up.
    """
    if sex == "female":
        primary, secondary = female_words, male_words
    else:
        primary, secondary = male_words, female_words
    picked = np.random.choice(primary, size=4, replace=True).tolist()
    picked += np.random.choice(secondary, size=1, replace=True).tolist()
    picked += np.random.choice(neutral_words, size=3, replace=True).tolist()
    np.random.shuffle(picked)
    return " ".join(picked)

# FIX: cross_val_score was used below but never imported anywhere in the
# notebook, so this cell raised NameError. Imported locally to keep the fix
# self-contained.
from sklearn.model_selection import cross_val_score

# Simulate 500 students with random sexes and matching synthetic reports.
n_students = 500
sexes = np.random.choice(["female", "male"], size=n_students)
reports = [generate_report(s) for s in sexes]
y = (sexes == "female").astype(int)  # 1 = female, 0 = male

# Bag-of-words features, capped at the 200 most frequent tokens.
vectorizer = CountVectorizer(max_features=200)
X = vectorizer.fit_transform(reports)

# L1-penalised (lasso) logistic regression: sparse coefficients make the
# most predictive words easy to read off.
clf = LogisticRegression(
    penalty="l1",  # lasso; NOTE(review): deprecated in sklearn >= 1.8 — prefer l1_ratio=1 there
    C=0.5,
    solver="liblinear",
    random_state=42,
    )

# 5-fold out-of-sample accuracy; 0.5 is chance for this balanced binary target.
scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")

# Report accuracy as percentage points above chance.
print(f"Gap: +{(scores.mean()-0.5)*100:.1f} pp")

# Visualise the most predictive words
# Refit on the full sample, then show the 8 most negative (boy-predictive)
# and 8 most positive (girl-predictive) lasso coefficients.
clf.fit(X, y)
feature_names = vectorizer.get_feature_names_out()
coefs = pd.Series(clf.coef_[0], index=feature_names).sort_values()
top_male = coefs.head(8)    # negative coef → male
top_female = coefs.tail(8)  # positive coef → female
top = pd.concat([top_male, top_female])

# Colour bars by sign: blue = boy-predictive, red = girl-predictive.
colors = ["#2c7bb6" if c < 0 else "#d7191c" for c in top]
fig, ax = plt.subplots(figsize=(9, 5))
top.plot(kind="barh", color=colors, ax=ax, edgecolor="white")
ax.axvline(0, color="black", linewidth=0.8)
ax.set_xlabel("LASSO coefficient (negative → boys, positive → girls)")
ax.set_title("Most predictive words for sex classification in report cards")
blue_patch = mpatches.Patch(color="#2c7bb6", label="Boys")
red_patch  = mpatches.Patch(color="#d7191c", label="Girls")
ax.legend(handles=[blue_patch, red_patch])
plt.tight_layout()
plt.show()
Gap: +50.0 pp
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(
/opt/python/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
  warnings.warn(

Embeddings

  • https://github.com/prodriguezsosa/EmbeddingRegression

  • https://doi.org/10.1017/S0003055422001228

  • Compute embeddings of the same word in different contexts

  • Fit the regression \(Embeddings = \alpha + \beta Covariates + \epsilon\)

  • Can do hypothesis testing about how embeddings vary depending on covariates

  • Do Republicans and Democrats use the same words to mean different things?

  • Can we quantify partisan divergence in how concepts are framed? This is a different question from word frequency!

.

.

Vocabulary Comparison

  • https://github.com/MS20190155/Measuring-Corporate-Culture-Using-Machine-Learning?tab=readme-ov-file

  • https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3256608

  • Measure “corporate culture” through vocabulary in earnings reports

  • Take a specific cultural trait, e.g. “teamwork”

  • Choose a set of related words, here: collaboration, collaborative, cooperate, cooperation, cooperative

  • Define the dictionary of “teamwork”-related words as the 500 words closest to the average word in the set as measured by cosine similarity

  • In each earnings call transcript, compute the TF-IDF scores of those words -> a high TF-IDF score for "teamwork" words = the firm probably values teamwork

  • Define “strong culture” as “Strong culture is an indicator variable that takes the value of one if the sum of a firm’s five cultural values is in the top quartile across all Compustat firms in a year, and zero otherwise.”

Named Entity Recognition (NER)

  • https://minio.lab.sspcloud.fr/ssphub/diffusion/website/2025-12-network/2_JOCAS_DARES.pdf

  • From DARES (French Labor Statistical institute)

  • Collect online job postings

  • Train a model to detect tokens about digital skills, using human annotations as labels, then predict for out-of-sample tokens

Topic Modeling

  • https://www.nber.org/system/files/working_papers/w29686/w29686.pdf

  • What do Americans spontaneously think about when asked about taxes? i.e. not with closed questions

  • Open-ended survey responses cluster around fairness, government trust, loopholes.

  • Partisan gaps

  • In the paper, topic words are hand-picked by the researchers, but one could use LDA to avoid specifying topics in advance

# Synthetic open-ended survey responses about taxes, written to cluster
# around themes such as fairness, government trust, loopholes and complexity.
survey_responses = [
    "The rich should pay more it's not fair that billionaires pay less than workers",
    "Government wastes our money on programs that don't work",
    "Companies use loopholes and offshore accounts to avoid taxes entirely",
    "I worry about social security and healthcare being cut to fund tax breaks",
    "Fairness is the most important principle taxes should be equal for everyone",
    "Too much bureaucracy in the tax system it's too complicated to file",
    "Wealthy people hire lawyers to avoid taxes while regular people pay full rate",
    "Trust in government is low I don't know where my money goes",
    "Tax cuts for corporations don't create jobs they just enrich shareholders",
    "The estate tax is unfair families should not lose their farm to inheritance taxes",
    "Hard working people are penalized while those who cheat the system benefit",
    "Congress needs to close loopholes and make the system simpler and transparent",
    "Hedge fund managers pay lower rates than nurses because of capital gains loopholes that's backwards",
    "The top one percent owns more than the bottom fifty but somehow always gets the bigger tax break",
    "Stock buybacks should be taxed the same as dividends companies use them to dodge obligations",
    "I pay taxes every year and watch it disappear into contractors and consultants with nothing to show",
    "Foreign aid and bloated agencies get funded while local roads and schools fall apart",
    "Audits hit small businesses and middle class families while corporations get away with anything",
    "The tax code is thousands of pages long written by lobbyists for lobbyists not regular citizens",
    "I spent four hours on my return for a refund of sixty dollars something is deeply broken",
    "Why do I need to pay someone to tell the government what it already knows from my employer",
    "Self employed people pay double the social security tax that's a penalty for working for yourself",
    "I follow every rule and still owe money at the end of the year while big companies pay nothing",
    "My neighbor opened an LLC and suddenly deducts everything meanwhile I get nothing",
    "A flat tax sounds fair until you realize ten percent means nothing to a billionaire and everything to me",
    "Automatic filing like other countries do would save billions in wasted time and compliance costs",
    "We need real time reporting and public disclosure not self reported honor system accounting",
    "Roads schools firefighters none of that is free someone has to pay and it should be proportional",
    "I don't mind paying taxes if I could see the results but the connection between payment and service is invisible",
    "Other countries get healthcare and transit from their taxes we get wars and bank bailouts",
]

# Bag-of-words document-term matrix over the survey answers, with English
# stop words removed (top-level names kept for later cells).
vectorizer = CountVectorizer(stop_words="english", min_df=1)
dtm = vectorizer.fit_transform(survey_responses)
vocab = vectorizer.get_feature_names_out()

# Fit a 2-topic LDA; fixed seed for reproducible topics.
lda = LatentDirichletAllocation(n_components=2, random_state=42, max_iter=200)
lda.fit(dtm)

print("Top words per topic:\n")
for comp in lda.components_:
    # argsort is ascending, so the last six indices are the heaviest words.
    heaviest = reversed(comp.argsort()[-6:])
    top_words = [vocab[j] for j in heaviest]
    print(f"{', '.join(top_words)}\n\n")
Top words per topic:

don, tax, taxes, government, companies, money


pay, tax, taxes, people, fund, social