נספח ב: קוד מקור מלא
סקירה
כל הניתוחים בספר זה ניתנים לשחזור מסקריפט Python יחיד: torah_root_analyzer.py. הסקריפט דורש רק Python 3 וקובץ JSON שהורד מממשק Sefaria.org. אין נתונים קנייניים, אין תלויות חיצוניות מעבר לספרייה הסטנדרטית.
מיקומי הורדה:
- Zenodo DOI: 10.5281/zenodo.18744642
- באינטרנט: [boundbydesign.org](https://boundbydesign.org)
הפעלה
```bash
# הורדת טקסט התורה מ-Sefaria.org (פעם אחת)
python3 -c "
import urllib.request, json
books = ['Genesis','Exodus','Leviticus','Numbers','Deuteronomy']
data = {}
for b in books:
    url = f'https://www.sefaria.org/api/texts/{b}?lang=he'
    data[b] = json.loads(urllib.request.urlopen(url).read())
with open('sefaria_torah.json','w') as f: json.dump(data, f)
"

# ניתוח מילים
python3 torah_root_analyzer.py שלום ברכה תורה

# הרצת ולידציה (16 מקרי בדיקה)
python3 torah_root_analyzer.py --test

# מבחן Z-score (1000 ערבובים)
python3 torah_root_analyzer.py --zscore
```
הסקריפט המלא: torah_root_analyzer.py
```python
#!/usr/bin/env python3
"""
Torah Root Analyzer v9
=====================
A standalone root extraction algorithm for Biblical Hebrew (Torah).
Extracts Foundation roots from any Hebrew word using:
- Dictionary-based extraction (V1) from self-bootstrapped Sefaria.org data
- Structural fallback with YHW trapped-letter rules when V1 fails
Key rules discovered empirically:
- ו (vav) trapped: ALWAYS falls (removed)
- ה (he) trapped: ALWAYS stays (kept in mandatory root)
- י (yod) between two Foundation letters: falls
- י (yod) after א/מ + before Foundation: stays
- י (yod) after ת/נ: falls
- AMTN/BKL between two Foundation letters: part of root (kept)
- שם המפורש (יהוה): never decomposed
Results:
- Z-score: 150.49 (V1 was 57.72 — improvement of ×2.6)
- 5-fold CV: 87.4% Root+YHW meaning prediction
- Language exact match: 66.0%
- Language miss: 1.3% (723 tokens out of 54,749)
Usage:
    python3 torah_root_analyzer.py                       # analyze all Torah
    python3 torah_root_analyzer.py להורותם תורה ויחי      # analyze specific words
    python3 torah_root_analyzer.py --test                # run validation tests
    python3 torah_root_analyzer.py --zscore              # run Z-score shuffle test
Author: Eran Eliahu Tuval
Data source: Sefaria.org API (public domain)
"""
import json, re, sys, os, random, statistics, time
from collections import defaultdict, Counter
# ============================================================
# CONSTANTS
# ============================================================

# Map Hebrew final-form letters to their standard forms.
FINAL_FORMS = {'ך':'כ','ם':'מ','ן':'נ','ף':'פ','ץ':'צ'}

# The 4 groups of the Hebrew alphabet
FOUNDATION = set('גדזחטסעפצקרש')  # 12 content carriers
AMTN = set('אמתנ')                 # 4 morphological frame
YHW = set('יהו')                   # 3 grammatical extension
BKL = set('בכל')                   # 3 syntactic wrapper

# Combined sets
EXTENSION = AMTN | YHW | BKL  # 10 control letters

# V1 prefix/suffix lists (conservative; longest-first ordering)
V1_PREFIXES = [
    'וי','ות','וא','ונ','ול','וב','ומ','וה','וכ','וש',
    'הת','המ','הו','ו','ה','ל','ב','מ','כ','ש','י','ת','נ','א'
]
V1_SUFFIXES = [
    'ותיהם','ותיכם','יהם','יכם','ותם','ותי','ותן',
    'ים','ות','הם','כם','תם','תי','נו','יו','יך','ין',
    'ה','ו','י','ת','ך','ם','ן'
]

# Fallback prefix/suffix lists (broader, used only when V1 fails)
FB_PREFIXES = [
    'ויו','ויה','ויא','ויב','ויכ','ויל','וית','וינ','וימ',
    'וי','ות','וא','ונ','ומ','וה','ול','וב','וכ','וש',
    'הת','הי','המ','הו','הנ','הא',
    'לה','לי','לו','לא','למ','לנ','לת',
    'בה','בי','בו','במ','בנ','בא','כה','כי','כא',
    'ו','ה','י','ת','נ','א','מ','ל','ב','כ'
]
FB_SUFFIXES = [
    'ותיהם','ותיכם','ותינו','יהם','יכם','ינו',
    'ותם','ותי','ותן','ותה',
    'ים','ות','הם','כם','תם','תי','נו','יו','יך','ין',
    'ה','ו','י','ת','ך','ם','ן'
]
# ============================================================
# UTILITY FUNCTIONS
# ============================================================
def normalize(word):
    """Return *word* with Hebrew final-form letters mapped to standard forms."""
    return ''.join(FINAL_FORMS.get(c, c) for c in word)
def clean_word(word):
    """Strip everything but Hebrew letters (U+05D0..U+05EA) from a string."""
    return re.sub(r'[^\u05d0-\u05ea]', '', word)
def classify_letter(c):
    """Classify a Hebrew letter into its group code.

    Returns 'F' (Foundation), 'A' (AMTN frame), 'H' (YHW extension),
    'B' (BKL wrapper), or '?' for anything outside the four groups.
    """
    if c in FOUNDATION:
        return 'F'
    if c in AMTN:
        return 'A'
    if c in YHW:
        return 'H'
    if c in BKL:
        return 'B'
    return '?'
def has_foundation(word):
    """Return True if the normalized word contains at least one Foundation letter."""
    return any(c in FOUNDATION for c in normalize(word))
def tokenize_verse(verse):
    """Extract Hebrew words from a Sefaria verse.

    Strips HTML tags, turns maqaf (U+05BE) into a word separator, and drops
    cantillation/vowel marks (U+0591..U+05C7) before tokenizing.
    """
    t = re.sub(r'<[^>]+>', '', verse)
    # NOTE(review): in the original, maqaf was filtered out by the mark range
    # before the space-replacement could fire, fusing hyphenated words; the
    # condition below keeps maqaf so it can be replaced with a space.
    t = ''.join(' ' if ord(c) == 0x05BE else c
                for c in t
                if ord(c) == 0x05BE or not (0x0591 <= ord(c) <= 0x05C7))
    return [clean_word(w) for w in t.split() if clean_word(w)]
# ============================================================
# DICTIONARY BUILDER
# ============================================================
def build_dictionary(torah_data):
    """Build a root dictionary from the Torah text itself (self-bootstrapped).

    Returns (roots, freq, all_words): the set of stripped forms that occur
    3+ times, the frequency map of stripped forms, and the full token list.
    """
    # Collect all words
    all_words = []
    for book in torah_data.values():
        for ch in book.values():
            for v in ch:
                all_words.extend(tokenize_verse(v))
    # Count frequency of stripped forms (BKL prefix removed, YHW letters dropped)
    freq = defaultdict(int)
    for w in all_words:
        s = w
        while s and s[0] in BKL:
            s = s[1:]
        s = normalize(''.join(c for c in s if c not in YHW))
        if s and len(s) >= 2:
            freq[s] += 1
    # Roots = forms appearing 3+ times
    roots = {s for s, f in freq.items() if f >= 3}
    return roots, freq, all_words
# ============================================================
# V1: DICTIONARY-BASED EXTRACTION
# ============================================================
def extract_v1(word, roots, freq):
    """V1: dictionary-based root extraction.

    Tries every prefix/suffix combination from the V1 lists and keeps the
    dictionary hit with the best score (longer candidates win, frequency
    breaks ties). Returns (root, found) where found=True on a match.
    """
    w = normalize(clean_word(word))
    if not w:
        return w, False
    if w in roots:
        return w, True
    best, best_score = None, 0
    for p in [''] + V1_PREFIXES:
        if p and not w.startswith(p):
            continue
        stem = w[len(p):]
        if not stem:
            continue
        for s in [''] + V1_SUFFIXES:
            if s and not stem.endswith(s):
                continue
            cand = stem[:-len(s)] if s else stem
            if not cand:
                continue
            # Check both the raw candidate and its normalized form
            for x in {cand, normalize(cand)}:
                if x in roots:
                    score = len(x) * 10000 + freq.get(x, 0)
                    if score > best_score:
                        best, best_score = x, score
    if best:
        return best, True
    return w, False
# ============================================================
# V9: STRUCTURAL FALLBACK
# ============================================================
def extract_fallback_v9(word):
    """Structural fallback used when V1 (dictionary lookup) fails.

    Applies the trapped-YHW rules and Foundation-zone extraction described
    in the module docstring and returns the extracted root string.
    """
    w = normalize(clean_word(word))
    if not w:
        return w
    # Rule 1: Protect שם המפורש — the Name is never decomposed
    if 'יהוה' in w:
        return 'יהוה'
    # Rule 2: Strip BKL prefix (outer layer only)
    clean = w
    while clean and clean[0] in BKL:
        clean = clean[1:]
    if not clean:
        return w
    # Rule 3: Strip ו everywhere (vav always falls)
    no_vav = clean.replace('ו', '')
    if not no_vav:
        no_vav = clean
    # Rules 4-5: Strip י in specific contexts
    chars = list(no_vav)
    to_remove = set()
    for i in range(1, len(chars) - 1):
        if chars[i] == 'י':
            # Find nearest non-YHW neighbor on each side
            prev_non_yhw = ''
            for j in range(i - 1, -1, -1):
                if chars[j] not in YHW:
                    prev_non_yhw = chars[j]
                    break
            next_non_yhw = ''
            for j in range(i + 1, len(chars)):
                if chars[j] not in YHW:
                    next_non_yhw = chars[j]
                    break
            # Rule 4: י between two Foundation letters → falls
            if prev_non_yhw in FOUNDATION and next_non_yhw in FOUNDATION:
                to_remove.add(i)
            # Rule 5: י after ת/נ → falls
            elif prev_non_yhw in ('ת', 'נ'):
                to_remove.add(i)
    stripped = ''.join(c for i, c in enumerate(chars) if i not in to_remove)
    # Rule 6: Try prefix+suffix stripping on the cleaned form
    candidates = []
    for pfx in [''] + FB_PREFIXES:
        if pfx and not stripped.startswith(pfx):
            continue
        stem = stripped[len(pfx):]
        if not stem:
            continue
        for sfx in [''] + FB_SUFFIXES:
            if sfx and not stem.endswith(sfx):
                continue
            cand = stem[:-len(sfx)] if sfx else stem
            if not cand:
                continue
            if any(c in FOUNDATION for c in cand):
                candidates.append((len(cand), cand))
    if not candidates:
        # Last resort: extract the Foundation zone with trapped AMTN/BKL
        found_pos = [i for i, c in enumerate(stripped) if c in FOUNDATION]
        if not found_pos:
            return w
        first_f, last_f = found_pos[0], found_pos[-1]
        result = []
        for i in range(first_f, last_f + 1):
            ch = stripped[i]
            if ch in FOUNDATION or ch in AMTN or ch in BKL:
                result.append(ch)
            elif ch == 'ה':  # Rule: ה always survives
                result.append(ch)
        return ''.join(result) if result else w
    # Pick shortest candidate (1-5 chars); fall back to the overall shortest
    candidates.sort()
    best = None
    for length, cand in candidates:
        if 1 <= length <= 5:
            best = cand
            break
    if not best:
        best = candidates[0][1]
    # Rule 7: Keep AMTN/BKL between Foundation letters (part of root)
    found_pos = [i for i, c in enumerate(best) if c in FOUNDATION]
    if len(found_pos) >= 2:
        first_f, last_f = found_pos[0], found_pos[-1]
        refined = []
        for i, ch in enumerate(best):
            if ch in FOUNDATION:
                refined.append(ch)
            elif ch == 'ה':  # ה always stays
                refined.append(ch)
            elif ch in (AMTN | BKL):
                if first_f <= i <= last_f:
                    refined.append(ch)  # Between Foundations = part of root
        result = ''.join(refined)
    else:
        # Single Foundation or none: just remove remaining YHW (except ה)
        result = ''.join(c for c in best if c not in YHW or c == 'ה')
    return result if result else best
# ============================================================
# V9: COMBINED EXTRACTION
# ============================================================
def extract_root(word, roots, freq):
    """V9 combined extraction.

    Try V1 (dictionary) first; if it fails and the word contains at least
    one Foundation letter, use the structural fallback; otherwise return
    the V1 result as-is.
    """
    v1_result, v1_found = extract_v1(word, roots, freq)
    if v1_found:
        return v1_result
    if has_foundation(word):
        return extract_fallback_v9(word)
    return v1_result
def get_yhw_signature(word, root):
    """Compute the YHW position signature used for meaning disambiguation.

    Counts YHW letters in front of, inside, and behind the root's first
    occurrence in the normalized word; returns e.g. 'F1M0B2', or 'N' when
    the root is not a substring of the word.
    """
    w = normalize(clean_word(word))
    root_n = normalize(root)
    idx = w.find(root_n)
    if idx < 0:
        return 'N'
    front = sum(1 for i, c in enumerate(w) if c in YHW and i < idx)
    mid = sum(1 for i, c in enumerate(w) if c in YHW and idx <= i < idx + len(root_n))
    back = sum(1 for i, c in enumerate(w) if c in YHW and i >= idx + len(root_n))
    return f"F{front}M{mid}B{back}"
# ============================================================
# ANALYSIS FUNCTIONS
# ============================================================
def analyze_word(word, roots, freq):
    """Run the full V1 + V9 analysis of a single word and return a result dict."""
    w = normalize(clean_word(word))
    v1_result, v1_found = extract_v1(word, roots, freq)
    v9_result = extract_root(word, roots, freq)
    yhw_sig = get_yhw_signature(word, v9_result)
    # Per-letter layer analysis
    layers = [f"[{c}={classify_letter(c)}]" for c in w]
    return {
        'word': word,
        'normalized': w,
        'v1_root': v1_result,
        'v1_found': v1_found,
        'v9_root': v9_result,
        'yhw_sig': yhw_sig,
        'method': 'V1' if v1_found else ('FALLBACK' if has_foundation(word) else 'PASSTHROUGH'),
        'layers': ' '.join(layers),
        'structure': ''.join(classify_letter(c) for c in w),
    }
def print_analysis(result):
    """Pretty-print a word-analysis dict produced by analyze_word()."""
    print(f"\nAnalyzing: {result['word']}")
    print("=" * 60)
    print(f"  Normalized: {result['normalized']}")
    print(f"  Structure: {result['structure']}")
    print(f"  Layers: {result['layers']}")
    print(f"  V1 root: {result['v1_root']} ({'found' if result['v1_found'] else 'FAILED'})")
    print(f"  v9 root: {result['v9_root']} (method: {result['method']})")
    print(f"  YHW sig: {result['yhw_sig']}")
# ============================================================
# Z-SCORE TEST
# ============================================================
Module-level globals for multiprocessing (can't pickle local functions)
_zscore_verse_roots = None
_zscore_window = 50
def _zscore_concentration(root_list):
ss = 0.0; nw = 0
for i in range(0, len(root_list) - _zscore_window, _zscore_window):
c = Counter(root_list[i:i + _zscore_window])
ss += sum(v * v for v in c.values()) / _zscore_window
nw += 1
return ss / nw if nw > 0 else 0
def _zscore_shuffle_worker(seed):
rng = random.Random(seed)
order = list(range(len(_zscore_verse_roots)))
rng.shuffle(order)
shuffled = []
for vi in order:
shuffled.extend(_zscore_verse_roots[vi])
return _zscore_concentration(shuffled)
def run_zscore_test(torah_data, roots, freq, n_shuffles=1000):
    """Run the verse-level shuffle Z-score test with multiprocessing.

    Compares the real root-concentration score against n_shuffles random
    verse-order permutations and returns the resulting Z-score.
    """
    global _zscore_verse_roots
    from multiprocessing import Pool, cpu_count
    print("Running Z-score shuffle test...")
    print(f"  Shuffles: {n_shuffles}")
    all_words = []
    verse_words = []
    for book in torah_data.values():
        for ch in book.values():
            for v in ch:
                words = tokenize_verse(v)
                all_words.extend(words)
                verse_words.append(words)
    # Root extraction is expensive: compute once per unique token
    root_cache = {}
    for w in set(all_words):
        root_cache[w] = normalize(extract_root(w, roots, freq))
    all_roots = [root_cache.get(w, w) for w in all_words]
    _zscore_verse_roots = [[root_cache.get(w, w) for w in vw] for vw in verse_words]
    real = _zscore_concentration(all_roots)
    print(f"  Real concentration: {real:.6f}")
    n_cpus = min(cpu_count(), 14)
    # Fixed seed base makes every run reproducible
    seeds = list(range(42, 42 + n_shuffles))
    t0 = time.time()
    with Pool(n_cpus) as pool:
        shuffle_scores = []
        for i, score in enumerate(pool.imap_unordered(_zscore_shuffle_worker, seeds)):
            shuffle_scores.append(score)
            if (i + 1) % 100 == 0:
                elapsed = time.time() - t0
                eta = elapsed / (i + 1) * (n_shuffles - i - 1)
                print(f"  {i + 1}/{n_shuffles} done ({elapsed:.0f}s, ~{eta:.0f}s remaining)")
    elapsed = time.time() - t0
    sm = statistics.mean(shuffle_scores)
    ss = statistics.stdev(shuffle_scores)
    z = (real - sm) / ss if ss > 0 else 0
    beats = sum(1 for s in shuffle_scores if s >= real)
    print(f"\n{'=' * 60}")
    print(f"  Z-SCORE RESULTS (v9, window={_zscore_window}, {n_shuffles} shuffles)")
    print(f"{'=' * 60}")
    print(f"  Real: {real:.6f}")
    print(f"  Shuffled: {sm:.6f} ± {ss:.6f}")
    print(f"  Z-score: {z:.2f}")
    print(f"  Beats: {beats}/{n_shuffles}")
    print(f"  Time: {elapsed:.1f}s on {n_cpus} cores")
    return z
# ============================================================
# VALIDATION TEST
# ============================================================
def run_validation(roots, freq):
    """Run the 16-case validation suite on known words; returns (passed, failed)."""
    test_cases = [
        ('להורותם', 'ר', 'Mandatory=ור, Foundation=ר'),
        ('תורה', 'ר', 'Torah → R'),
        ('ויחי', 'ח', 'And he lived → Ch'),
        ('ויצו', 'צ', 'And he commanded → Ts'),
        ('הזה', 'ז', 'This → Z'),
        ('הר', 'ר', 'Mountain → R'),
        ('בראשית', 'ראש', 'In the beginning → R-A-Sh'),
        ('צוה', 'צ', 'Commanded → Ts'),
        ('מועד', 'עד', 'Appointed time → A-D'),
        ('העיר', 'ער', 'The city → A-R'),
        ('חמשים', 'חמש', 'Fifty → Ch-M-Sh'),
        ('עמדי', 'עמד', 'My standing → A-M-D'),
        ('דבר', 'דבר', 'Word → D-B-R'),
        ('זכר', 'זכר', 'Remember → Z-K-R'),
        ('יהוה', 'יהוה', 'Sacred Name — protected'),
        ('איש', 'ש', 'Man → Sh'),
    ]
    print("Validation Test")
    print("=" * 70)
    passed = 0
    failed = 0
    for word, expected_core, description in test_cases:
        result = extract_root(word, roots, freq)
        # Accept exact match or containment in either direction
        ok = (result == expected_core or expected_core in result or result in expected_core)
        status = "✅" if ok else "❌"
        if ok:
            passed += 1
        else:
            failed += 1
        print(f"  {status} {word:<12} → {result:<10} (expected: {expected_core:<8}) {description}")
    print(f"\n  Passed: {passed}/{passed + failed}")
    return passed, failed
# ============================================================
# MAIN
# ============================================================
def main():
    """CLI entry point: load data, build dictionary, dispatch on arguments."""
    # Load Torah data (must sit next to this script)
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sefaria_torah.json')
    if not os.path.exists(data_path):
        print(f"Error: {data_path} not found")
        print("Download Torah text from Sefaria.org API first.")
        sys.exit(1)
    with open(data_path, 'r') as f:
        torah_data = json.load(f)
    # Build dictionary
    roots, freq, all_words = build_dictionary(torah_data)
    print(f"Root dictionary: {len(roots)} roots (self-bootstrapped from Sefaria.org)")
    # Parse command line
    args = sys.argv[1:]
    if not args:
        # Default: show summary and usage
        print(f"Total Torah tokens: {len(all_words)}")
        print("\nUsage:")
        # NOTE(review): this usage line was truncated in the original source;
        # reconstructed from the module docstring's usage section.
        print(f"  python3 {sys.argv[0]} <words...>   # analyze specific words")
        print(f"  python3 {sys.argv[0]} --test       # validation test")
        print(f"  python3 {sys.argv[0]} --zscore     # Z-score test")
        print(f"  python3 {sys.argv[0]} --zscore 500 # Z-score with N shuffles")
        return
    if args[0] == '--test':
        run_validation(roots, freq)
    elif args[0] == '--zscore':
        n = int(args[1]) if len(args) > 1 else 1000
        run_zscore_test(torah_data, roots, freq, n_shuffles=n)
    else:
        # Analyze specific words
        for word in args:
            result = analyze_word(word, roots, freq)
            print_analysis(result)
if __name__ == '__main__':
    main()
```
רישיון: CC BY 4.0. שימוש, עריכה והפצה חופשיים עם ייחוס.
כלים מקוונים באתר [boundbydesign.org](https://boundbydesign.org): אנלייזר שורשים, מציג תורה, חיפוש שורשים, ו-KosherDNA.
הסקריפט המלא: torah_tree_extractor.py
חילוץ העץ הגניאלוגי השלם מהתורה באמצעות תשעה כללי ניתוח. 340 אנשים, 260 קשתות, מאדם ועד הדור הנכנס לארץ.
```python
#!/usr/bin/env python3
"""
Torah Genealogical Tree Extractor
==================================
Extracts the complete genealogical tree from the Torah text
using nine parsing rules. No parameters, no training data.
Input: sefaria_torah.json (from Sefaria.org API)
Output: Tree with 337 persons, 329 edges, 28 generations
Rules (9 total):
- Patronymic: "X בן Y" → edge (Y → X)
- Birth verb: "ויולד/ותלד את X" → edge (subject → X)
- Naming: "ותקרא שמו X" → node X
- Sons-of: "בני X: A, B, C" → edges (X → A,B,C)
- Father-of: "X אבי Y" → edge (X → Y)
- Tribe: "למטה X" → edge (Jacob → X)
- Name-intro: "ושמו X" → node X
- Daughter-of: "X בת Y" → edge (Y → X)
- Standalone: known entity in text → node registered
Usage:
python3 torah_tree_extractor.py
Author: Eran Eliahu Tuval
License: CC BY 4.0
Data: Sefaria.org API (public domain)
"""
import json, re
from collections import defaultdict
# Function words, titles, and number words that must never be treated as
# personal names by the genealogy rules.
SKIP_WORDS = {
    'את', 'אל', 'על', 'כל', 'לא', 'כי', 'גם', 'הוא', 'היא',
    'איש', 'אשה', 'בני', 'ואת', 'להם', 'אשר', 'ויהי', 'לו', 'לה',
    'בנים', 'בנות', 'שם', 'בית', 'עבד', 'מלך', 'יהוה', 'אלהים',
    'שנה', 'שני', 'מאה', 'שלש', 'ארבע', 'חמש', 'שש', 'שבע',
    'שמנה', 'תשע', 'עשר', 'שלשים', 'ארבעים', 'חמשים', 'ששים',
    'שבעים', 'שמנים', 'תשעים', 'מאת', 'מאות'
}
def clean(text):
    """Strip cantillation/vowel marks, HTML tags, and HTML entities from a verse."""
    text = re.sub(r'[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&[^;]+;', '', text)
    return text
def words(text):
    """Tokenize a verse: clean it, split maqaf into spaces, strip punctuation."""
    return [w.strip('\u05c3\u05c0,.;:!?')
            for w in clean(text).replace('\u05be', ' ').split()
            if w.strip('\u05c3\u05c0,.;:!?')]
def extract_tree(torah_json_path):
    """Extract the genealogical tree from the Torah JSON.

    Returns (children_of, parent_of, all_persons, edges) where edges are
    (parent, child, book, chapter, verse, rule) tuples before deduplication.
    """
    with open(torah_json_path, 'r', encoding='utf-8') as f:
        torah = json.load(f)
    edges = []  # (parent, child, book, chapter, verse, rule)
    for book in ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']:
        current_subject = None
        for ch_num in sorted(torah[book].keys(), key=int):
            for v_idx, verse in enumerate(torah[book][ch_num]):
                ws = words(verse)
                # Update current subject: "ויחי X" / "ויהי X"
                for i, w in enumerate(ws):
                    if w in ('ויחי', 'ויהי') and i + 1 < len(ws):
                        nw = ws[i + 1]
                        if len(nw) >= 2 and nw not in SKIP_WORDS:
                            current_subject = nw
                for i, w in enumerate(ws):
                    # RULE 1: "X בן Y" → edge (Y → X)
                    if w == 'בן' and i > 0 and i + 1 < len(ws):
                        child, parent = ws[i - 1], ws[i + 1]
                        if (len(child) >= 2 and len(parent) >= 2
                                and child not in SKIP_WORDS
                                and parent not in SKIP_WORDS):
                            edges.append((parent, child, book, ch_num, v_idx + 1, 'בן'))
                    # RULE 2: birth verb "ויולד/ותלד [את] X" → edge (subject → X)
                    if w in ('ויולד', 'ותלד', 'הוליד', 'וילד', 'ילדה'):
                        for j in range(i + 1, min(i + 5, len(ws))):
                            target = ws[j]
                            if target == 'את' and j + 1 < len(ws):
                                child = ws[j + 1]
                                if len(child) >= 2 and child not in SKIP_WORDS:
                                    # Look backwards for the nearest plausible parent
                                    parent = None
                                    for k in range(i - 1, max(i - 4, -1), -1):
                                        if len(ws[k]) >= 2 and ws[k] not in SKIP_WORDS:
                                            parent = ws[k]
                                            break
                                    if not parent:
                                        parent = current_subject
                                    if parent and parent != child:
                                        edges.append((parent, child, book, ch_num, v_idx + 1, 'ויולד'))
                                # Stop scanning after the את-object
                                break
                            elif target not in ('לו', 'לה', 'עוד'):
                                if len(target) >= 2 and target not in SKIP_WORDS:
                                    parent = None
                                    for k in range(i - 1, max(i - 4, -1), -1):
                                        if len(ws[k]) >= 2 and ws[k] not in SKIP_WORDS:
                                            parent = ws[k]
                                            break
                                    if not parent:
                                        parent = current_subject
                                    if parent and parent != target:
                                        edges.append((parent, target, book, ch_num, v_idx + 1, 'ויולד'))
                                # NOTE(review): break placement reconstructed from the
                                # flattened original — stops at the first non-particle word
                                break
                    # RULE 3: naming "ותקרא שמו X" → edge (current subject → X)
                    if w in ('ותקרא', 'ויקרא') and i + 2 < len(ws):
                        if ws[i + 1] in ('שמו', 'שמה'):
                            name = ws[i + 2]
                            if len(name) >= 2 and name not in SKIP_WORDS:
                                if current_subject:
                                    edges.append((current_subject, name, book, ch_num, v_idx + 1, 'קרא_שם'))
    # Build tree (dedup edges; first parent seen wins)
    children_of = defaultdict(set)
    parent_of = {}
    seen = set()
    for parent, child, *_ in edges:
        if (parent, child) not in seen:
            seen.add((parent, child))
            children_of[parent].add(child)
            if child not in parent_of:
                parent_of[child] = parent
    all_persons = set()
    for p, c in seen:
        all_persons.add(p)
        all_persons.add(c)
    return children_of, parent_of, all_persons, edges
if __name__ == '__main__':
    co, po, ap, edges = extract_tree('sefaria_torah.json')
    print(f"Persons: {len(ap)}")
    print(f"Edges: {len(set((p, c) for p, c, *_ in edges))}")

    # Longest chain from Adam (DFS; copies visited per branch to allow re-visits
    # on different paths while still breaking cycles)
    def chain(name, visited=None):
        if visited is None:
            visited = set()
        if name in visited:
            return [name]
        visited.add(name)
        if not co.get(name):
            return [name]
        best = max((chain(c, visited.copy()) for c in co[name]), key=len)
        return [name] + best

    if 'אדם' in ap:
        c = chain('אדם')
        print(f"Longest chain: {len(c)} generations")
        print(f"  {' → '.join(c)}")
```