from sentence_transformers import util
def search(query: str, k: int = 5) -> None:
    """Print the ``k`` rows of ``work`` whose embeddings best match ``query``.

    The query is encoded with the module-level ``model`` and scored by
    cosine similarity against the module-level ``emb``; position ``i`` in
    the score vector is looked up as ``work.iloc[i]``.

    Args:
        query: Free-text search string.
        k: Number of top-ranked hits to print.

    Returns:
        None. The ranked hits are written to stdout.
    """
    q = model.encode([query], normalize_embeddings=True)
    # Only one query was encoded, so [0] selects its similarity scores.
    sims = util.cos_sim(q, emb)[0].cpu().numpy()
    # argsort is ascending; reverse it before keeping the k best scores.
    idx = sims.argsort()[::-1][:k]
    print(f'\n=== Query: "{query}" ===')
    for rank, i in enumerate(idx, 1):
        row = work.iloc[i]
        print(f"\n[{rank}] sim={sims[i]:.3f} | {row['taxonomy_level_1']} "
              f"| status={row['open_status']}")
        # One-line preview: first 260 characters with newlines flattened.
        print(" ", row[TEXT_COL][:260].replace("\n", " "), "…")
# Example queries exercising the semantic search helper.
search("rational points on hyperelliptic curves")
search("multiplicativity of maximal output p-norm of a quantum channel")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
# Fit a logistic-regression classifier that predicts the `open_status`
# column from `emb`, evaluated on a stratified 25% hold-out split.
y = work["open_status"].values
Xtr, Xte, ytr, yte = train_test_split(
    emb, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y)
clf = LogisticRegression(max_iter=2000, class_weight="balanced", C=2.0)
clf.fit(Xtr, ytr)
pred = clf.predict(Xte)
print("\n=== open_status classifier (embeddings + logistic regression) ===")
print(classification_report(yte, pred))

# Plot the confusion matrix for the hold-out predictions;
# normalize="true" scales each true-label row to sum to 1.
fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay.from_predictions(
    yte, pred, ax=ax, cmap="Blues", xticks_rotation=45,
    normalize="true", values_format=".2f")
ax.set_title("open_status confusion matrix (row-normalized)")
plt.tight_layout()
plt.show()
# Find the two distinct rows of `emb` with the highest cosine similarity.
sims = util.cos_sim(emb, emb).cpu().numpy()
# Mask self-similarity on the diagonal. Cosine similarity ranges over
# [-1, 1], so 0 is not a safe sentinel: if every off-diagonal value were
# negative, argmax would pick a diagonal entry. -inf can never win.
np.fill_diagonal(sims, -np.inf)
i, j = np.unravel_index(sims.argmax(), sims.shape)
print(f"\nMost similar pair (cos={sims[i, j]:.3f}):")
for n in (i, j):
    print(f"\n paper_id={work.iloc[n]['paper_id']} | "
          f"{work.iloc[n]['taxonomy_level_1']}")
    # One-line preview: first 240 characters with newlines flattened.
    print(" ", work.iloc[n][TEXT_COL][:240].replace("\n", " "), "…")
print("\nDone. Set SAMPLE_SIZE=None at the top to run on the full 14.1k rows.")
# NOTE(review): this repeats the `search` definition that appears earlier
# in the file; presumably a paste artifact. Confirm and consider removing.
def search(query: str, k: int = 5) -> None:
    """Print the ``k`` rows of ``work`` whose embeddings best match ``query``.

    The query is encoded with the module-level ``model`` and scored by
    cosine similarity against the module-level ``emb``; position ``i`` in
    the score vector is looked up as ``work.iloc[i]``.

    Args:
        query: Free-text search string.
        k: Number of top-ranked hits to print.

    Returns:
        None. The ranked hits are written to stdout.
    """
    q = model.encode([query], normalize_embeddings=True)
    # Only one query was encoded, so [0] selects its similarity scores.
    sims = util.cos_sim(q, emb)[0].cpu().numpy()
    # argsort is ascending; reverse it before keeping the k best scores.
    idx = sims.argsort()[::-1][:k]
    print(f'\n=== Query: "{query}" ===')
    for rank, i in enumerate(idx, 1):
        row = work.iloc[i]
        print(f"\n[{rank}] sim={sims[i]:.3f} | {row['taxonomy_level_1']} "
              f"| status={row['open_status']}")
        # One-line preview: first 260 characters with newlines flattened.
        print(" ", row[TEXT_COL][:260].replace("\n", " "), "…")
# NOTE(review): these two calls repeat the example queries issued earlier
# in the file; presumably a paste artifact. Confirm and consider removing.
search("rational points on hyperelliptic curves")
search("multiplicativity of maximal output p-norm of a quantum channel")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
# NOTE(review): this section repeats the classifier experiment that appears
# earlier in the file; presumably a paste artifact. Confirm before removing.
#
# Fit a logistic-regression classifier that predicts the `open_status`
# column from `emb`, evaluated on a stratified 25% hold-out split.
y = work["open_status"].values
Xtr, Xte, ytr, yte = train_test_split(
    emb, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y)
clf = LogisticRegression(max_iter=2000, class_weight="balanced", C=2.0)
clf.fit(Xtr, ytr)
pred = clf.predict(Xte)
print("\n=== open_status classifier (embeddings + logistic regression) ===")
print(classification_report(yte, pred))

# Plot the confusion matrix for the hold-out predictions;
# normalize="true" scales each true-label row to sum to 1.
fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay.from_predictions(
    yte, pred, ax=ax, cmap="Blues", xticks_rotation=45,
    normalize="true", values_format=".2f")
ax.set_title("open_status confusion matrix (row-normalized)")
plt.tight_layout()
plt.show()
# NOTE(review): this section repeats the most-similar-pair search that
# appears earlier in the file; presumably a paste artifact. Confirm before
# removing.
#
# Find the two distinct rows of `emb` with the highest cosine similarity.
sims = util.cos_sim(emb, emb).cpu().numpy()
# Mask self-similarity on the diagonal. Cosine similarity ranges over
# [-1, 1], so 0 is not a safe sentinel: if every off-diagonal value were
# negative, argmax would pick a diagonal entry. -inf can never win.
np.fill_diagonal(sims, -np.inf)
i, j = np.unravel_index(sims.argmax(), sims.shape)
print(f"\nMost similar pair (cos={sims[i, j]:.3f}):")
for n in (i, j):
    print(f"\n paper_id={work.iloc[n]['paper_id']} | "
          f"{work.iloc[n]['taxonomy_level_1']}")
    # One-line preview: first 240 characters with newlines flattened.
    print(" ", work.iloc[n][TEXT_COL][:240].replace("\n", " "), "…")
print("\nDone. Set SAMPLE_SIZE=None at the top to run on the full 14.1k rows.")


