נספח ב: קוד מקור מלא
סקירה
כל הניתוחים בספר זה ניתנים לשחזור מסקריפט Python יחיד: torah_root_analyzer.py. הסקריפט דורש רק Python 3 וקובץ JSON שהורד מממשק Sefaria.org. אין נתונים קנייניים, אין תלויות חיצוניות מעבר לספרייה הסטנדרטית.
מיקומי הורדה:
- Zenodo DOI: 10.5281/zenodo.18744642
- באינטרנט: [boundbydesign.org](https://boundbydesign.org)
הפעלה
```bash
# הורדת טקסט התורה מ-Sefaria.org (פעם אחת)
python3 -c "
import urllib.request, json
books = ['Genesis','Exodus','Leviticus','Numbers','Deuteronomy']
data = {}
for b in books:
    url = f'https://www.sefaria.org/api/texts/{b}?lang=he'
    data[b] = json.loads(urllib.request.urlopen(url).read())
with open('sefaria_torah.json','w') as f: json.dump(data, f)
"

# ניתוח מילים
python3 torah_root_analyzer.py שלום ברכה תורה

# הרצת ולידציה (16 מקרי בדיקה)
python3 torah_root_analyzer.py --test

# מבחן Z-score (1000 ערבובים)
python3 torah_root_analyzer.py --zscore
```
הסקריפט המלא: torah_root_analyzer.py
```python
#!/usr/bin/env python3
"""
Torah Root Analyzer v9
=====================
A standalone root extraction algorithm for Biblical Hebrew (Torah).
Extracts Foundation roots from any Hebrew word using:
- Dictionary-based extraction (V1) from self-bootstrapped Sefaria.org data
- Structural fallback with YHW trapped-letter rules when V1 fails
Key rules discovered empirically:
- ו (vav) trapped: ALWAYS falls (removed)
- ה (he) trapped: ALWAYS stays (kept in mandatory root)
- י (yod) between two Foundation letters: falls
- י (yod) after א/מ + before Foundation: stays
- י (yod) after ת/נ: falls
- AMTN/BKL between two Foundation letters: part of root (kept)
- שם המפורש (יהוה): never decomposed
Results:
- Z-score: 150.49 (V1 was 57.72 — improvement of ×2.6)
- 5-fold CV: 87.4% Root+YHW meaning prediction
- Language exact match: 66.0%
- Language miss: 1.3% (723 tokens out of 54,749)
Usage:
    python3 torah_root_analyzer.py                       # analyze all Torah
    python3 torah_root_analyzer.py להורותם תורה ויחי      # analyze specific words
    python3 torah_root_analyzer.py --test                # run validation tests
    python3 torah_root_analyzer.py --zscore              # run Z-score shuffle test
Author: Eran Eliahu Tuval
Data source: Sefaria.org API (public domain)
"""
import json, re, sys, os, random, statistics, time
from collections import defaultdict, Counter
# ============================================================
# CONSTANTS
# ============================================================

# Map Hebrew final-form letters to their standard forms.
FINAL_FORMS = {'ך':'כ','ם':'מ','ן':'נ','ף':'פ','ץ':'צ'}

# The 4 groups of the Hebrew alphabet
FOUNDATION = set('גדזחטסעפצקרש')  # 12 content carriers
AMTN = set('אמתנ')                 # 4 morphological frame
YHW = set('יהו')                   # 3 grammatical extension
BKL = set('בכל')                   # 3 syntactic wrapper

# Combined sets
EXTENSION = AMTN | YHW | BKL  # 10 control letters

# V1 prefix/suffix lists (conservative; longest-first ordering)
V1_PREFIXES = [
    'וי','ות','וא','ונ','ול','וב','ומ','וה','וכ','וש',
    'הת','המ','הו','ו','ה','ל','ב','מ','כ','ש','י','ת','נ','א'
]
V1_SUFFIXES = [
    'ותיהם','ותיכם','יהם','יכם','ותם','ותי','ותן',
    'ים','ות','הם','כם','תם','תי','נו','יו','יך','ין',
    'ה','ו','י','ת','ך','ם','ן'
]

# Fallback prefix/suffix lists (broader, used only when V1 fails)
FB_PREFIXES = [
    'ויו','ויה','ויא','ויב','ויכ','ויל','וית','וינ','וימ',
    'וי','ות','וא','ונ','ומ','וה','ול','וב','וכ','וש',
    'הת','הי','המ','הו','הנ','הא',
    'לה','לי','לו','לא','למ','לנ','לת',
    'בה','בי','בו','במ','בנ','בא','כה','כי','כא',
    'ו','ה','י','ת','נ','א','מ','ל','ב','כ'
]
FB_SUFFIXES = [
    'ותיהם','ותיכם','ותינו','יהם','יכם','ינו',
    'ותם','ותי','ותן','ותה',
    'ים','ות','הם','כם','תם','תי','נו','יו','יך','ין',
    'ה','ו','י','ת','ך','ם','ן'
]
# ============================================================
# UTILITY FUNCTIONS
# ============================================================
def normalize(word):
    """Return *word* with Hebrew final-form letters mapped to standard forms."""
    return ''.join(FINAL_FORMS.get(c, c) for c in word)
def clean_word(word):
    """Strip everything but Hebrew letters (U+05D0..U+05EA) from a string."""
    return re.sub(r'[^\u05d0-\u05ea]', '', word)
def classify_letter(c):
    """Classify a Hebrew letter into its group code.

    Returns 'F' (Foundation), 'A' (AMTN frame), 'H' (YHW extension),
    'B' (BKL wrapper), or '?' for anything outside the four groups.
    """
    if c in FOUNDATION:
        return 'F'
    if c in AMTN:
        return 'A'
    if c in YHW:
        return 'H'
    if c in BKL:
        return 'B'
    return '?'
def has_foundation(word):
    """Return True if the normalized word contains at least one Foundation letter."""
    return any(c in FOUNDATION for c in normalize(word))
def tokenize_verse(verse):
    """Extract Hebrew words from a Sefaria verse.

    Strips HTML tags, turns maqaf (U+05BE) into a word separator, and drops
    cantillation/vowel marks (U+0591..U+05C7) before tokenizing.
    """
    t = re.sub(r'<[^>]+>', '', verse)
    # NOTE(review): in the original, maqaf was filtered out by the mark range
    # before the space-replacement could fire, fusing hyphenated words; the
    # condition below keeps maqaf so it can be replaced with a space.
    t = ''.join(' ' if ord(c) == 0x05BE else c
                for c in t
                if ord(c) == 0x05BE or not (0x0591 <= ord(c) <= 0x05C7))
    return [clean_word(w) for w in t.split() if clean_word(w)]
# ============================================================
# DICTIONARY BUILDER
# ============================================================
def build_dictionary(torah_data):
    """Build a root dictionary from the Torah text itself (self-bootstrapped).

    Returns (roots, freq, all_words): the set of stripped forms that occur
    3+ times, the frequency map of stripped forms, and the full token list.
    """
    # Collect all words
    all_words = []
    for book in torah_data.values():
        for ch in book.values():
            for v in ch:
                all_words.extend(tokenize_verse(v))
    # Count frequency of stripped forms (BKL prefix removed, YHW letters dropped)
    freq = defaultdict(int)
    for w in all_words:
        s = w
        while s and s[0] in BKL:
            s = s[1:]
        s = normalize(''.join(c for c in s if c not in YHW))
        if s and len(s) >= 2:
            freq[s] += 1
    # Roots = forms appearing 3+ times
    roots = {s for s, f in freq.items() if f >= 3}
    return roots, freq, all_words
# ============================================================
# V1: DICTIONARY-BASED EXTRACTION
# ============================================================
def extract_v1(word, roots, freq):
    """V1: dictionary-based root extraction.

    Tries every prefix/suffix combination from the V1 lists and keeps the
    dictionary hit with the best score (longer candidates win, frequency
    breaks ties). Returns (root, found) where found=True on a match.
    """
    w = normalize(clean_word(word))
    if not w:
        return w, False
    if w in roots:
        return w, True
    best, best_score = None, 0
    for p in [''] + V1_PREFIXES:
        if p and not w.startswith(p):
            continue
        stem = w[len(p):]
        if not stem:
            continue
        for s in [''] + V1_SUFFIXES:
            if s and not stem.endswith(s):
                continue
            cand = stem[:-len(s)] if s else stem
            if not cand:
                continue
            # Check both the raw candidate and its normalized form
            for x in {cand, normalize(cand)}:
                if x in roots:
                    score = len(x) * 10000 + freq.get(x, 0)
                    if score > best_score:
                        best, best_score = x, score
    if best:
        return best, True
    return w, False
# ============================================================
# V9: STRUCTURAL FALLBACK
# ============================================================
def extract_fallback_v9(word):
    """Structural fallback used when V1 (dictionary lookup) fails.

    Applies the trapped-YHW rules and Foundation-zone extraction described
    in the module docstring and returns the extracted root string.
    """
    w = normalize(clean_word(word))
    if not w:
        return w
    # Rule 1: Protect שם המפורש — the Name is never decomposed
    if 'יהוה' in w:
        return 'יהוה'
    # Rule 2: Strip BKL prefix (outer layer only)
    clean = w
    while clean and clean[0] in BKL:
        clean = clean[1:]
    if not clean:
        return w
    # Rule 3: Strip ו everywhere (vav always falls)
    no_vav = clean.replace('ו', '')
    if not no_vav:
        no_vav = clean
    # Rules 4-5: Strip י in specific contexts
    chars = list(no_vav)
    to_remove = set()
    for i in range(1, len(chars) - 1):
        if chars[i] == 'י':
            # Find nearest non-YHW neighbor on each side
            prev_non_yhw = ''
            for j in range(i - 1, -1, -1):
                if chars[j] not in YHW:
                    prev_non_yhw = chars[j]
                    break
            next_non_yhw = ''
            for j in range(i + 1, len(chars)):
                if chars[j] not in YHW:
                    next_non_yhw = chars[j]
                    break
            # Rule 4: י between two Foundation letters → falls
            if prev_non_yhw in FOUNDATION and next_non_yhw in FOUNDATION:
                to_remove.add(i)
            # Rule 5: י after ת/נ → falls
            elif prev_non_yhw in ('ת', 'נ'):
                to_remove.add(i)
    stripped = ''.join(c for i, c in enumerate(chars) if i not in to_remove)
    # Rule 6: Try prefix+suffix stripping on the cleaned form
    candidates = []
    for pfx in [''] + FB_PREFIXES:
        if pfx and not stripped.startswith(pfx):
            continue
        stem = stripped[len(pfx):]
        if not stem:
            continue
        for sfx in [''] + FB_SUFFIXES:
            if sfx and not stem.endswith(sfx):
                continue
            cand = stem[:-len(sfx)] if sfx else stem
            if not cand:
                continue
            if any(c in FOUNDATION for c in cand):
                candidates.append((len(cand), cand))
    if not candidates:
        # Last resort: extract the Foundation zone with trapped AMTN/BKL
        found_pos = [i for i, c in enumerate(stripped) if c in FOUNDATION]
        if not found_pos:
            return w
        first_f, last_f = found_pos[0], found_pos[-1]
        result = []
        for i in range(first_f, last_f + 1):
            ch = stripped[i]
            if ch in FOUNDATION or ch in AMTN or ch in BKL:
                result.append(ch)
            elif ch == 'ה':  # Rule: ה always survives
                result.append(ch)
        return ''.join(result) if result else w
    # Pick shortest candidate (1-5 chars); fall back to the overall shortest
    candidates.sort()
    best = None
    for length, cand in candidates:
        if 1 <= length <= 5:
            best = cand
            break
    if not best:
        best = candidates[0][1]
    # Rule 7: Keep AMTN/BKL between Foundation letters (part of root)
    found_pos = [i for i, c in enumerate(best) if c in FOUNDATION]
    if len(found_pos) >= 2:
        first_f, last_f = found_pos[0], found_pos[-1]
        refined = []
        for i, ch in enumerate(best):
            if ch in FOUNDATION:
                refined.append(ch)
            elif ch == 'ה':  # ה always stays
                refined.append(ch)
            elif ch in (AMTN | BKL):
                if first_f <= i <= last_f:
                    refined.append(ch)  # Between Foundations = part of root
        result = ''.join(refined)
    else:
        # Single Foundation or none: just remove remaining YHW (except ה)
        result = ''.join(c for c in best if c not in YHW or c == 'ה')
    return result if result else best
# ============================================================
# V9: COMBINED EXTRACTION
# ============================================================
def extract_root(word, roots, freq):
    """V9 combined extraction.

    Try V1 (dictionary) first; if it fails and the word contains at least
    one Foundation letter, use the structural fallback; otherwise return
    the V1 result as-is.
    """
    v1_result, v1_found = extract_v1(word, roots, freq)
    if v1_found:
        return v1_result
    if has_foundation(word):
        return extract_fallback_v9(word)
    return v1_result
def get_yhw_signature(word, root):
    """Compute the YHW position signature used for meaning disambiguation.

    Counts YHW letters in front of, inside, and behind the root's first
    occurrence in the normalized word; returns e.g. 'F1M0B2', or 'N' when
    the root is not a substring of the word.
    """
    w = normalize(clean_word(word))
    root_n = normalize(root)
    idx = w.find(root_n)
    if idx < 0:
        return 'N'
    front = sum(1 for i, c in enumerate(w) if c in YHW and i < idx)
    mid = sum(1 for i, c in enumerate(w) if c in YHW and idx <= i < idx + len(root_n))
    back = sum(1 for i, c in enumerate(w) if c in YHW and i >= idx + len(root_n))
    return f"F{front}M{mid}B{back}"
# ============================================================
# ANALYSIS FUNCTIONS
# ============================================================
def analyze_word(word, roots, freq):
    """Run the full V1 + V9 analysis of a single word and return a result dict."""
    w = normalize(clean_word(word))
    v1_result, v1_found = extract_v1(word, roots, freq)
    v9_result = extract_root(word, roots, freq)
    yhw_sig = get_yhw_signature(word, v9_result)
    # Per-letter layer analysis
    layers = [f"[{c}={classify_letter(c)}]" for c in w]
    return {
        'word': word,
        'normalized': w,
        'v1_root': v1_result,
        'v1_found': v1_found,
        'v9_root': v9_result,
        'yhw_sig': yhw_sig,
        'method': 'V1' if v1_found else ('FALLBACK' if has_foundation(word) else 'PASSTHROUGH'),
        'layers': ' '.join(layers),
        'structure': ''.join(classify_letter(c) for c in w),
    }
def print_analysis(result):
    """Pretty-print a word-analysis dict produced by analyze_word()."""
    print(f"\nAnalyzing: {result['word']}")
    print("=" * 60)
    print(f"  Normalized: {result['normalized']}")
    print(f"  Structure: {result['structure']}")
    print(f"  Layers: {result['layers']}")
    print(f"  V1 root: {result['v1_root']} ({'found' if result['v1_found'] else 'FAILED'})")
    print(f"  v9 root: {result['v9_root']} (method: {result['method']})")
    print(f"  YHW sig: {result['yhw_sig']}")
# ============================================================
# Z-SCORE TEST
# ============================================================
Module-level globals for multiprocessing (can't pickle local functions)
_zscore_verse_roots = None
_zscore_window = 50
def _zscore_concentration(root_list):
ss = 0.0; nw = 0
for i in range(0, len(root_list) - _zscore_window, _zscore_window):
c = Counter(root_list[i:i + _zscore_window])
ss += sum(v * v for v in c.values()) / _zscore_window
nw += 1
return ss / nw if nw > 0 else 0
def _zscore_shuffle_worker(seed):
rng = random.Random(seed)
order = list(range(len(_zscore_verse_roots)))
rng.shuffle(order)
shuffled = []
for vi in order:
shuffled.extend(_zscore_verse_roots[vi])
return _zscore_concentration(shuffled)
def run_zscore_test(torah_data, roots, freq, n_shuffles=1000):
    """Run the verse-level shuffle Z-score test with multiprocessing.

    Compares the real root-concentration score against n_shuffles random
    verse-order permutations and returns the resulting Z-score.
    """
    global _zscore_verse_roots
    from multiprocessing import Pool, cpu_count
    print("Running Z-score shuffle test...")
    print(f"  Shuffles: {n_shuffles}")
    all_words = []
    verse_words = []
    for book in torah_data.values():
        for ch in book.values():
            for v in ch:
                words = tokenize_verse(v)
                all_words.extend(words)
                verse_words.append(words)
    # Root extraction is expensive: compute once per unique token
    root_cache = {}
    for w in set(all_words):
        root_cache[w] = normalize(extract_root(w, roots, freq))
    all_roots = [root_cache.get(w, w) for w in all_words]
    _zscore_verse_roots = [[root_cache.get(w, w) for w in vw] for vw in verse_words]
    real = _zscore_concentration(all_roots)
    print(f"  Real concentration: {real:.6f}")
    n_cpus = min(cpu_count(), 14)
    # Fixed seed base makes every run reproducible
    seeds = list(range(42, 42 + n_shuffles))
    t0 = time.time()
    with Pool(n_cpus) as pool:
        shuffle_scores = []
        for i, score in enumerate(pool.imap_unordered(_zscore_shuffle_worker, seeds)):
            shuffle_scores.append(score)
            if (i + 1) % 100 == 0:
                elapsed = time.time() - t0
                eta = elapsed / (i + 1) * (n_shuffles - i - 1)
                print(f"  {i + 1}/{n_shuffles} done ({elapsed:.0f}s, ~{eta:.0f}s remaining)")
    elapsed = time.time() - t0
    sm = statistics.mean(shuffle_scores)
    ss = statistics.stdev(shuffle_scores)
    z = (real - sm) / ss if ss > 0 else 0
    beats = sum(1 for s in shuffle_scores if s >= real)
    print(f"\n{'=' * 60}")
    print(f"  Z-SCORE RESULTS (v9, window={_zscore_window}, {n_shuffles} shuffles)")
    print(f"{'=' * 60}")
    print(f"  Real: {real:.6f}")
    print(f"  Shuffled: {sm:.6f} ± {ss:.6f}")
    print(f"  Z-score: {z:.2f}")
    print(f"  Beats: {beats}/{n_shuffles}")
    print(f"  Time: {elapsed:.1f}s on {n_cpus} cores")
    return z
# ============================================================
# VALIDATION TEST
# ============================================================
def run_validation(roots, freq):
    """Run the 16-case validation suite on known words; returns (passed, failed)."""
    test_cases = [
        ('להורותם', 'ר', 'Mandatory=ור, Foundation=ר'),
        ('תורה', 'ר', 'Torah → R'),
        ('ויחי', 'ח', 'And he lived → Ch'),
        ('ויצו', 'צ', 'And he commanded → Ts'),
        ('הזה', 'ז', 'This → Z'),
        ('הר', 'ר', 'Mountain → R'),
        ('בראשית', 'ראש', 'In the beginning → R-A-Sh'),
        ('צוה', 'צ', 'Commanded → Ts'),
        ('מועד', 'עד', 'Appointed time → A-D'),
        ('העיר', 'ער', 'The city → A-R'),
        ('חמשים', 'חמש', 'Fifty → Ch-M-Sh'),
        ('עמדי', 'עמד', 'My standing → A-M-D'),
        ('דבר', 'דבר', 'Word → D-B-R'),
        ('זכר', 'זכר', 'Remember → Z-K-R'),
        ('יהוה', 'יהוה', 'Sacred Name — protected'),
        ('איש', 'ש', 'Man → Sh'),
    ]
    print("Validation Test")
    print("=" * 70)
    passed = 0
    failed = 0
    for word, expected_core, description in test_cases:
        result = extract_root(word, roots, freq)
        # Accept exact match or containment in either direction
        ok = (result == expected_core or expected_core in result or result in expected_core)
        status = "✅" if ok else "❌"
        if ok:
            passed += 1
        else:
            failed += 1
        print(f"  {status} {word:<12} → {result:<10} (expected: {expected_core:<8}) {description}")
    print(f"\n  Passed: {passed}/{passed + failed}")
    return passed, failed
# ============================================================
# MAIN
# ============================================================
def main():
    """CLI entry point: load data, build dictionary, dispatch on arguments."""
    # Load Torah data (must sit next to this script)
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sefaria_torah.json')
    if not os.path.exists(data_path):
        print(f"Error: {data_path} not found")
        print("Download Torah text from Sefaria.org API first.")
        sys.exit(1)
    with open(data_path, 'r') as f:
        torah_data = json.load(f)
    # Build dictionary
    roots, freq, all_words = build_dictionary(torah_data)
    print(f"Root dictionary: {len(roots)} roots (self-bootstrapped from Sefaria.org)")
    # Parse command line
    args = sys.argv[1:]
    if not args:
        # Default: show summary and usage
        print(f"Total Torah tokens: {len(all_words)}")
        print("\nUsage:")
        # NOTE(review): this usage line was truncated in the original source;
        # reconstructed from the module docstring's usage section.
        print(f"  python3 {sys.argv[0]} <words...>   # analyze specific words")
        print(f"  python3 {sys.argv[0]} --test       # validation test")
        print(f"  python3 {sys.argv[0]} --zscore     # Z-score test")
        print(f"  python3 {sys.argv[0]} --zscore 500 # Z-score with N shuffles")
        return
    if args[0] == '--test':
        run_validation(roots, freq)
    elif args[0] == '--zscore':
        n = int(args[1]) if len(args) > 1 else 1000
        run_zscore_test(torah_data, roots, freq, n_shuffles=n)
    else:
        # Analyze specific words
        for word in args:
            result = analyze_word(word, roots, freq)
            print_analysis(result)
if __name__ == '__main__':
    main()
```
רישיון: CC BY 4.0. שימוש, עריכה והפצה חופשיים עם ייחוס.
כלים מקוונים באתר [boundbydesign.org](https://boundbydesign.org): אנלייזר שורשים, מציג תורה, חיפוש שורשים, ו-KosherDNA.
הסקריפט המלא: torah_tree_extractor.py
חילוץ העץ הגניאלוגי השלם מהתורה באמצעות תשעה כללי ניתוח. 340 אנשים, 260 קשתות, מאדם ועד הדור הנכנס לארץ.
```python
#!/usr/bin/env python3
"""
Torah Genealogical Tree Extractor
==================================
Extracts the complete genealogical tree from the Torah text
using nine parsing rules. No parameters, no training data.
Input: sefaria_torah.json (from Sefaria.org API)
Output: Tree with 337 persons, 329 edges, 28 generations
Rules (9 total):
- Patronymic: "X בן Y" → edge (Y → X)
- Birth verb: "ויולד/ותלד את X" → edge (subject → X)
- Naming: "ותקרא שמו X" → node X
- Sons-of: "בני X: A, B, C" → edges (X → A,B,C)
- Father-of: "X אבי Y" → edge (X → Y)
- Tribe: "למטה X" → edge (Jacob → X)
- Name-intro: "ושמו X" → node X
- Daughter-of: "X בת Y" → edge (Y → X)
- Standalone: known entity in text → node registered
Usage:
python3 torah_tree_extractor.py
Author: Eran Eliahu Tuval
License: CC BY 4.0
Data: Sefaria.org API (public domain)
"""
import json, re
from collections import defaultdict
# Function words, titles, and number words that must never be treated as
# personal names by the genealogy rules.
SKIP_WORDS = {
    'את', 'אל', 'על', 'כל', 'לא', 'כי', 'גם', 'הוא', 'היא',
    'איש', 'אשה', 'בני', 'ואת', 'להם', 'אשר', 'ויהי', 'לו', 'לה',
    'בנים', 'בנות', 'שם', 'בית', 'עבד', 'מלך', 'יהוה', 'אלהים',
    'שנה', 'שני', 'מאה', 'שלש', 'ארבע', 'חמש', 'שש', 'שבע',
    'שמנה', 'תשע', 'עשר', 'שלשים', 'ארבעים', 'חמשים', 'ששים',
    'שבעים', 'שמנים', 'תשעים', 'מאת', 'מאות'
}
def clean(text):
    """Strip cantillation/vowel marks, HTML tags, and HTML entities from a verse."""
    text = re.sub(r'[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&[^;]+;', '', text)
    return text
def words(text):
    """Tokenize a verse: clean it, split maqaf into spaces, strip punctuation."""
    return [w.strip('\u05c3\u05c0,.;:!?')
            for w in clean(text).replace('\u05be', ' ').split()
            if w.strip('\u05c3\u05c0,.;:!?')]
def extract_tree(torah_json_path):
    """Extract the genealogical tree from the Torah JSON.

    Returns (children_of, parent_of, all_persons, edges) where edges are
    (parent, child, book, chapter, verse, rule) tuples before deduplication.
    """
    with open(torah_json_path, 'r', encoding='utf-8') as f:
        torah = json.load(f)
    edges = []  # (parent, child, book, chapter, verse, rule)
    for book in ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']:
        current_subject = None
        for ch_num in sorted(torah[book].keys(), key=int):
            for v_idx, verse in enumerate(torah[book][ch_num]):
                ws = words(verse)
                # Update current subject: "ויחי X" / "ויהי X"
                for i, w in enumerate(ws):
                    if w in ('ויחי', 'ויהי') and i + 1 < len(ws):
                        nw = ws[i + 1]
                        if len(nw) >= 2 and nw not in SKIP_WORDS:
                            current_subject = nw
                for i, w in enumerate(ws):
                    # RULE 1: "X בן Y" → edge (Y → X)
                    if w == 'בן' and i > 0 and i + 1 < len(ws):
                        child, parent = ws[i - 1], ws[i + 1]
                        if (len(child) >= 2 and len(parent) >= 2
                                and child not in SKIP_WORDS
                                and parent not in SKIP_WORDS):
                            edges.append((parent, child, book, ch_num, v_idx + 1, 'בן'))
                    # RULE 2: birth verb "ויולד/ותלד [את] X" → edge (subject → X)
                    if w in ('ויולד', 'ותלד', 'הוליד', 'וילד', 'ילדה'):
                        for j in range(i + 1, min(i + 5, len(ws))):
                            target = ws[j]
                            if target == 'את' and j + 1 < len(ws):
                                child = ws[j + 1]
                                if len(child) >= 2 and child not in SKIP_WORDS:
                                    # Look backwards for the nearest plausible parent
                                    parent = None
                                    for k in range(i - 1, max(i - 4, -1), -1):
                                        if len(ws[k]) >= 2 and ws[k] not in SKIP_WORDS:
                                            parent = ws[k]
                                            break
                                    if not parent:
                                        parent = current_subject
                                    if parent and parent != child:
                                        edges.append((parent, child, book, ch_num, v_idx + 1, 'ויולד'))
                                # Stop scanning after the את-object
                                break
                            elif target not in ('לו', 'לה', 'עוד'):
                                if len(target) >= 2 and target not in SKIP_WORDS:
                                    parent = None
                                    for k in range(i - 1, max(i - 4, -1), -1):
                                        if len(ws[k]) >= 2 and ws[k] not in SKIP_WORDS:
                                            parent = ws[k]
                                            break
                                    if not parent:
                                        parent = current_subject
                                    if parent and parent != target:
                                        edges.append((parent, target, book, ch_num, v_idx + 1, 'ויולד'))
                                # NOTE(review): break placement reconstructed from the
                                # flattened original — stops at the first non-particle word
                                break
                    # RULE 3: naming "ותקרא שמו X" → edge (current subject → X)
                    if w in ('ותקרא', 'ויקרא') and i + 2 < len(ws):
                        if ws[i + 1] in ('שמו', 'שמה'):
                            name = ws[i + 2]
                            if len(name) >= 2 and name not in SKIP_WORDS:
                                if current_subject:
                                    edges.append((current_subject, name, book, ch_num, v_idx + 1, 'קרא_שם'))
    # Build tree (dedup edges; first parent seen wins)
    children_of = defaultdict(set)
    parent_of = {}
    seen = set()
    for parent, child, *_ in edges:
        if (parent, child) not in seen:
            seen.add((parent, child))
            children_of[parent].add(child)
            if child not in parent_of:
                parent_of[child] = parent
    all_persons = set()
    for p, c in seen:
        all_persons.add(p)
        all_persons.add(c)
    return children_of, parent_of, all_persons, edges
if __name__ == '__main__':
    co, po, ap, edges = extract_tree('sefaria_torah.json')
    print(f"Persons: {len(ap)}")
    print(f"Edges: {len(set((p, c) for p, c, *_ in edges))}")

    # Longest chain from Adam (DFS; copies visited per branch to allow re-visits
    # on different paths while still breaking cycles)
    def chain(name, visited=None):
        if visited is None:
            visited = set()
        if name in visited:
            return [name]
        visited.add(name)
        if not co.get(name):
            return [name]
        best = max((chain(c, visited.copy()) for c in co[name]), key=len)
        return [name] + best

    if 'אדם' in ap:
        c = chain('אדם')
        print(f"Longest chain: {len(c)} generations")
        print(f"  {' → '.join(c)}")
```