Appendix B: Complete Source Code

Overview

All analyses in this book are reproducible from a single Python script: torah_root_analyzer.py. The script requires only Python 3 and a JSON file downloaded from the Sefaria.org API. No proprietary data, no external dependencies beyond the standard library.

Download locations: the one-time data-fetch command below retrieves the Torah text from the Sefaria.org API; the full scripts are reproduced later in this appendix.

How to Run

```bash

Download Torah text from Sefaria.org (one-time)

python3 -c "

import urllib.request, json

books = ['Genesis','Exodus','Leviticus','Numbers','Deuteronomy']

data = {}

for b in books:

url = f'https://www.sefaria.org/api/texts/{b}?lang=he'

data[b] = json.loads(urllib.request.urlopen(url).read())

with open('sefaria_torah.json','w') as f: json.dump(data, f)

"

Analyze words

python3 torah_root_analyzer.py ืฉืœื•ื ื‘ืจื›ื” ืชื•ืจื”

Run validation (16 test cases)

python3 torah_root_analyzer.py --test

Run Z-score shuffle test (default 1000 shuffles)

python3 torah_root_analyzer.py --zscore

```

Complete Script: torah_root_analyzer.py

```python

#!/usr/bin/env python3

"""

Torah Root Analyzer v9

=====================

A standalone root extraction algorithm for Biblical Hebrew (Torah).

Extracts Foundation roots from any Hebrew word using:

  1. Dictionary-based extraction (V1) from self-bootstrapped Sefaria.org data
  2. Structural fallback with YHW trapped-letter rules when V1 fails

Key rules discovered empirically:

Results:

Usage:

python3 torah_root_analyzer_v9.py # analyze all Torah

python3 torah_root_analyzer_v9.py ืœื”ื•ืจื•ืชื ืชื•ืจื” ื•ื™ื—ื™ # analyze specific words

python3 torah_root_analyzer_v9.py --test # run validation tests

python3 torah_root_analyzer_v9.py --zscore # run Z-score shuffle test

Author: Eran Eliahu Tuval

Data source: Sefaria.org API (public domain)

"""

import json, re, sys, os, random, statistics, time

from collections import defaultdict, Counter

============================================================

CONSTANTS

============================================================

FINAL_FORMS = {'ืš':'ื›','ื':'ืž','ืŸ':'ื ','ืฃ':'ืค','ืฅ':'ืฆ'}

The 4 groups of the Hebrew alphabet

FOUNDATION = set('ื’ื“ื–ื—ื˜ืกืขืคืฆืงืจืฉ') # 12 content carriers

AMTN = set('ืืžืชื ') # 4 morphological frame

YHW = set('ื™ื”ื•') # 3 grammatical extension

BKL = set('ื‘ื›ืœ') # 3 syntactic wrapper

Combined sets

EXTENSION = AMTN | YHW | BKL # 10 control letters

V1 prefix/suffix lists

V1_PREFIXES = [

'ื•ื™','ื•ืช','ื•ื','ื•ื ','ื•ืœ','ื•ื‘','ื•ืž','ื•ื”','ื•ื›','ื•ืฉ',

'ื”ืช','ื”ืž','ื”ื•','ื•','ื”','ืœ','ื‘','ืž','ื›','ืฉ','ื™','ืช','ื ','ื'

]

V1_SUFFIXES = [

'ื•ืชื™ื”ื','ื•ืชื™ื›ื','ื™ื”ื','ื™ื›ื','ื•ืชื','ื•ืชื™','ื•ืชืŸ',

'ื™ื','ื•ืช','ื”ื','ื›ื','ืชื','ืชื™','ื ื•','ื™ื•','ื™ืš','ื™ืŸ',

'ื”','ื•','ื™','ืช','ืš','ื','ืŸ'

]

Fallback prefix/suffix lists (broader)

FB_PREFIXES = [

'ื•ื™ื•','ื•ื™ื”','ื•ื™ื','ื•ื™ื‘','ื•ื™ื›','ื•ื™ืœ','ื•ื™ืช','ื•ื™ื ','ื•ื™ืž',

'ื•ื™','ื•ืช','ื•ื','ื•ื ','ื•ืž','ื•ื”','ื•ืœ','ื•ื‘','ื•ื›','ื•ืฉ',

'ื”ืช','ื”ื™','ื”ืž','ื”ื•','ื”ื ','ื”ื',

'ืœื”','ืœื™','ืœื•','ืœื','ืœืž','ืœื ','ืœืช',

'ื‘ื”','ื‘ื™','ื‘ื•','ื‘ืž','ื‘ื ','ื‘ื','ื›ื”','ื›ื™','ื›ื',

'ื•','ื”','ื™','ืช','ื ','ื','ืž','ืœ','ื‘','ื›'

]

FB_SUFFIXES = [

'ื•ืชื™ื”ื','ื•ืชื™ื›ื','ื•ืชื™ื ื•','ื™ื”ื','ื™ื›ื','ื™ื ื•',

'ื•ืชื','ื•ืชื™','ื•ืชืŸ','ื•ืชื”',

'ื™ื','ื•ืช','ื”ื','ื›ื','ืชื','ืชื™','ื ื•','ื™ื•','ื™ืš','ื™ืŸ',

'ื”','ื•','ื™','ืช','ืš','ื','ืŸ'

]

============================================================

UTILITY FUNCTIONS

============================================================

def normalize(word):

"""Normalize final forms to standard forms"""

return ''.join(FINAL_FORMS.get(c, c) for c in word)

def clean_word(word):

"""Extract only Hebrew letters from a string"""

return re.sub(r'[^\u05d0-\u05ea]', '', word)

def classify_letter(c):

"""Classify a Hebrew letter into its group"""

if c in FOUNDATION: return 'F'

if c in AMTN: return 'A'

if c in YHW: return 'H'

if c in BKL: return 'B'

return '?'

def has_foundation(word):

"""Does word contain at least one Foundation letter?"""

return any(c in FOUNDATION for c in normalize(word))

def tokenize_verse(verse):

"""Extract Hebrew words from a Sefaria verse (with HTML/cantillation marks)"""

t = re.sub(r'<[^>]+>', '', verse)

t = ''.join(' ' if ord(c) == 0x05BE else c

for c in t if not (0x0591 <= ord(c) <= 0x05C7))

return [clean_word(w) for w in t.split() if clean_word(w)]

============================================================

DICTIONARY BUILDER

============================================================

def build_dictionary(torah_data):

"""Build root dictionary from Torah text (self-bootstrapped, no external data)"""

Collect all words

all_words = []

for book in torah_data.values():

for ch in book.values():

for v in ch:

all_words.extend(tokenize_verse(v))

Count frequency of stripped forms

freq = defaultdict(int)

for w in all_words:

s = w

while s and s[0] in BKL:

s = s[1:]

s = normalize(''.join(c for c in s if c not in YHW))

if s and len(s) >= 2:

freq[s] += 1

Roots = forms appearing 3+ times

roots = {s for s, f in freq.items() if f >= 3}

return roots, freq, all_words

============================================================

V1: DICTIONARY-BASED EXTRACTION

============================================================

def extract_v1(word, roots, freq):

"""

V1: Dictionary-based root extraction.

Returns (root, found) where found=True if dictionary matched.

"""

w = normalize(clean_word(word))

if not w:

return w, False

if w in roots:

return w, True

best, best_score = None, 0

for p in [''] + V1_PREFIXES:

if p and not w.startswith(p):

continue

stem = w[len(p):]

if not stem:

continue

for s in [''] + V1_SUFFIXES:

if s and not stem.endswith(s):

continue

cand = stem[:-len(s)] if s else stem

if not cand:

continue

for x in {cand, normalize(cand)}:

if x in roots:

score = len(x) * 10000 + freq.get(x, 0)

if score > best_score:

best, best_score = x, score

if best:

return best, True

return w, False

============================================================

V9: STRUCTURAL FALLBACK

============================================================

def extract_fallback_v9(word):

"""

Structural fallback when V1 fails.

Applies trapped-YHW rules and Foundation-zone extraction.

"""

w = normalize(clean_word(word))

if not w:

return w

Rule 1: Protect ืฉื ื”ืžืคื•ืจืฉ

if 'ื™ื”ื•ื”' in w:

return 'ื™ื”ื•ื”'

Rule 2: Strip BKL prefix (outer layer only)

clean = w

while clean and clean[0] in BKL:

clean = clean[1:]

if not clean:

return w

Rule 3: Strip ื• everywhere (always falls)

no_vav = clean.replace('ื•', '')

if not no_vav:

no_vav = clean

Rule 4-5: Strip ื™ in specific contexts

chars = list(no_vav)

to_remove = set()

for i in range(1, len(chars) - 1):

if chars[i] == 'ื™':

Find nearest non-YHW neighbor on each side

prev_non_yhw = ''

for j in range(i - 1, -1, -1):

if chars[j] not in YHW:

prev_non_yhw = chars[j]

break

next_non_yhw = ''

for j in range(i + 1, len(chars)):

if chars[j] not in YHW:

next_non_yhw = chars[j]

break

Rule 4: ื™ between two Foundation โ†’ falls

if prev_non_yhw in FOUNDATION and next_non_yhw in FOUNDATION:

to_remove.add(i)

Rule 5: ื™ after ืช/ื  โ†’ falls

elif prev_non_yhw in ('ืช', 'ื '):

to_remove.add(i)

stripped = ''.join(c for i, c in enumerate(chars) if i not in to_remove)

Rule 6: Try prefix+suffix stripping on cleaned form

candidates = []

for pfx in [''] + FB_PREFIXES:

if pfx and not stripped.startswith(pfx):

continue

stem = stripped[len(pfx):]

if not stem:

continue

for sfx in [''] + FB_SUFFIXES:

if sfx and not stem.endswith(sfx):

continue

cand = stem[:-len(sfx)] if sfx else stem

if not cand:

continue

if any(c in FOUNDATION for c in cand):

candidates.append((len(cand), cand))

if not candidates:

Last resort: extract Foundation zone with trapped AMTN/BKL

found_pos = [i for i, c in enumerate(stripped) if c in FOUNDATION]

if not found_pos:

return w

first_f, last_f = found_pos[0], found_pos[-1]

result = []

for i in range(first_f, last_f + 1):

ch = stripped[i]

if ch in FOUNDATION or ch in AMTN or ch in BKL:

result.append(ch)

elif ch == 'ื”': # Rule: ื” always survives

result.append(ch)

return ''.join(result) if result else w

Pick shortest candidate (1-5 chars)

candidates.sort()

best = None

for length, cand in candidates:

if 1 <= length <= 5:

best = cand

break

if not best:

best = candidates[0][1]

Rule 7: Keep AMTN/BKL between Foundation letters (part of root)

found_pos = [i for i, c in enumerate(best) if c in FOUNDATION]

if len(found_pos) >= 2:

first_f, last_f = found_pos[0], found_pos[-1]

refined = []

for i, ch in enumerate(best):

if ch in FOUNDATION:

refined.append(ch)

elif ch == 'ื”': # ื” always stays

refined.append(ch)

elif ch in (AMTN | BKL):

if first_f <= i <= last_f:

refined.append(ch) # Between Foundations = part of root

result = ''.join(refined)

else:

Single Foundation or none: just remove remaining YHW (except ื”)

result = ''.join(c for c in best if c not in YHW or c == 'ื”')

return result if result else best

============================================================

V9: COMBINED EXTRACTION

============================================================

def extract_root(word, roots, freq):

"""

V9 combined extraction:

  1. Try V1 (dictionary) first
  2. If V1 fails AND word has Foundation letter(s) โ†’ structural fallback
  3. Otherwise return V1 result as-is

"""

v1_result, v1_found = extract_v1(word, roots, freq)

if v1_found:

return v1_result

if has_foundation(word):

return extract_fallback_v9(word)

return v1_result

def get_yhw_signature(word, root):

"""Compute YHW position signature for meaning disambiguation"""

w = normalize(clean_word(word))

root_n = normalize(root)

idx = w.find(root_n)

if idx < 0:

return 'N'

front = sum(1 for i, c in enumerate(w) if c in YHW and i < idx)

mid = sum(1 for i, c in enumerate(w) if c in YHW and idx <= i < idx + len(root_n))

back = sum(1 for i, c in enumerate(w) if c in YHW and i >= idx + len(root_n))

return f"F{front}M{mid}B{back}"

============================================================

ANALYSIS FUNCTIONS

============================================================

def analyze_word(word, roots, freq):

"""Full analysis of a single word"""

w = normalize(clean_word(word))

v1_result, v1_found = extract_v1(word, roots, freq)

v9_result = extract_root(word, roots, freq)

yhw_sig = get_yhw_signature(word, v9_result)

Layer analysis

layers = []

for c in w:

group = classify_letter(c)

layers.append(f"[{c}={group}]")

return {

'word': word,

'normalized': w,

'v1_root': v1_result,

'v1_found': v1_found,

'v9_root': v9_result,

'yhw_sig': yhw_sig,

'method': 'V1' if v1_found else ('FALLBACK' if has_foundation(word) else 'PASSTHROUGH'),

'layers': ' '.join(layers),

'structure': ''.join(classify_letter(c) for c in w),

}

def print_analysis(result):

"""Pretty-print word analysis"""

print(f"\nAnalyzing: {result['word']}")

print("=" * 60)

print(f" Normalized: {result['normalized']}")

print(f" Structure: {result['structure']}")

print(f" Layers: {result['layers']}")

print(f" V1 root: {result['v1_root']} ({'found' if result['v1_found'] else 'FAILED'})")

print(f" v9 root: {result['v9_root']} (method: {result['method']})")

print(f" YHW sig: {result['yhw_sig']}")

============================================================

Z-SCORE TEST

============================================================

Module-level globals for multiprocessing (can't pickle local functions)

_zscore_verse_roots = None

_zscore_window = 50

def _zscore_concentration(root_list):

ss = 0.0; nw = 0

for i in range(0, len(root_list) - _zscore_window, _zscore_window):

c = Counter(root_list[i:i + _zscore_window])

ss += sum(v * v for v in c.values()) / _zscore_window

nw += 1

return ss / nw if nw > 0 else 0

def _zscore_shuffle_worker(seed):

rng = random.Random(seed)

order = list(range(len(_zscore_verse_roots)))

rng.shuffle(order)

shuffled = []

for vi in order:

shuffled.extend(_zscore_verse_roots[vi])

return _zscore_concentration(shuffled)

def run_zscore_test(torah_data, roots, freq, n_shuffles=1000):

"""Run verse-level shuffle Z-score test with multiprocessing"""

global _zscore_verse_roots

from multiprocessing import Pool, cpu_count

print("Running Z-score shuffle test...")

print(f" Shuffles: {n_shuffles}")

all_words = []

verse_words = []

for book in torah_data.values():

for ch in book.values():

for v in ch:

words = tokenize_verse(v)

all_words.extend(words)

verse_words.append(words)

root_cache = {}

for w in set(all_words):

root_cache[w] = normalize(extract_root(w, roots, freq))

all_roots = [root_cache.get(w, w) for w in all_words]

_zscore_verse_roots = [[root_cache.get(w, w) for w in vw] for vw in verse_words]

real = _zscore_concentration(all_roots)

print(f" Real concentration: {real:.6f}")

n_cpus = min(cpu_count(), 14)

seeds = list(range(42, 42 + n_shuffles))

t0 = time.time()

with Pool(n_cpus) as pool:

shuffle_scores = []

for i, score in enumerate(pool.imap_unordered(_zscore_shuffle_worker, seeds)):

shuffle_scores.append(score)

if (i + 1) % 100 == 0:

elapsed = time.time() - t0

eta = elapsed / (i + 1) * (n_shuffles - i - 1)

print(f" {i + 1}/{n_shuffles} done ({elapsed:.0f}s, ~{eta:.0f}s remaining)")

elapsed = time.time() - t0

sm = statistics.mean(shuffle_scores)

ss = statistics.stdev(shuffle_scores)

z = (real - sm) / ss if ss > 0 else 0

beats = sum(1 for s in shuffle_scores if s >= real)

print(f"\n{'=' * 60}")

print(f" Z-SCORE RESULTS (v9, window={_zscore_window}, {n_shuffles} shuffles)")

print(f"{'=' * 60}")

print(f" Real: {real:.6f}")

print(f" Shuffled: {sm:.6f} ยฑ {ss:.6f}")

print(f" Z-score: {z:.2f}")

print(f" Beats: {beats}/{n_shuffles}")

print(f" Time: {elapsed:.1f}s on {n_cpus} cores")

return z

============================================================

VALIDATION TEST

============================================================

def run_validation(roots, freq):

"""Run validation on known words"""

test_cases = [

('ืœื”ื•ืจื•ืชื', 'ืจ', 'Mandatory=ื•ืจ, Foundation=ืจ'),

('ืชื•ืจื”', 'ืจ', 'Torah โ†’ R'),

('ื•ื™ื—ื™', 'ื—', 'And he lived โ†’ Ch'),

('ื•ื™ืฆื•', 'ืฆ', 'And he commanded โ†’ Ts'),

('ื”ื–ื”', 'ื–', 'This โ†’ Z'),

('ื”ืจ', 'ืจ', 'Mountain โ†’ R'),

('ื‘ืจืืฉื™ืช', 'ืจืืฉ', 'In the beginning โ†’ R-A-Sh'),

('ืฆื•ื”', 'ืฆ', 'Commanded โ†’ Ts'),

('ืžื•ืขื“', 'ืขื“', 'Appointed time โ†’ A-D'),

('ื”ืขื™ืจ', 'ืขืจ', 'The city โ†’ A-R'),

('ื—ืžืฉื™ื', 'ื—ืžืฉ', 'Fifty โ†’ Ch-M-Sh'),

('ืขืžื“ื™', 'ืขืžื“', 'My standing โ†’ A-M-D'),

('ื“ื‘ืจ', 'ื“ื‘ืจ', 'Word โ†’ D-B-R'),

('ื–ื›ืจ', 'ื–ื›ืจ', 'Remember โ†’ Z-K-R'),

('ื™ื”ื•ื”', 'ื™ื”ื•ื”', 'Sacred Name โ€” protected'),

('ืื™ืฉ', 'ืฉ', 'Man โ†’ Sh'),

]

print("Validation Test")

print("=" * 70)

passed = 0

failed = 0

for word, expected_core, description in test_cases:

result = extract_root(word, roots, freq)

ok = (result == expected_core or expected_core in result or result in expected_core)

status = "โœ…" if ok else "โŒ"

if ok:

passed += 1

else:

failed += 1

print(f" {status} {word:<12} โ†’ {result:<10} (expected: {expected_core:<8}) {description}")

print(f"\n Passed: {passed}/{passed + failed}")

return passed, failed

============================================================

MAIN

============================================================

def main():

Load Torah data

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sefaria_torah.json')

if not os.path.exists(data_path):

print(f"Error: {data_path} not found")

print("Download Torah text from Sefaria.org API first.")

sys.exit(1)

with open(data_path, 'r') as f:

torah_data = json.load(f)

Build dictionary

roots, freq, all_words = build_dictionary(torah_data)

print(f"Root dictionary: {len(roots)} roots (self-bootstrapped from Sefaria.org)")

Parse command line

args = sys.argv[1:]

if not args:

Default: show summary

print(f"Total Torah tokens: {len(all_words)}")

print(f"\nUsage:")

print(f" python3 {sys.argv[0]} ... # analyze words")

print(f" python3 {sys.argv[0]} --test # validation test")

print(f" python3 {sys.argv[0]} --zscore # Z-score test")

print(f" python3 {sys.argv[0]} --zscore 500 # Z-score with N shuffles")

return

if args[0] == '--test':

run_validation(roots, freq)

elif args[0] == '--zscore':

n = int(args[1]) if len(args) > 1 else 1000

run_zscore_test(torah_data, roots, freq, n_shuffles=n)

else:

Analyze specific words

for word in args:

result = analyze_word(word, roots, freq)

print_analysis(result)

if __name__ == '__main__':

main()

```


License: CC BY 4.0. Use, modify, and distribute freely with attribution.

Online tools at [boundbydesign.org](https://boundbydesign.org): Interactive root analyzer, Torah visualizer, root search engine, and KosherDNA classifier.


Complete Script: torah_tree_extractor.py

Extracts the complete genealogical tree from the Torah using nine parsing rules: 337 persons, 329 edges, from Adam to the generation entering the Land.

```python

#!/usr/bin/env python3

"""

Torah Genealogical Tree Extractor

==================================

Extracts the complete genealogical tree from the Torah text

using nine parsing rules. No parameters, no training data.

Input: sefaria_torah.json (from Sefaria.org API)

Output: Tree with 337 persons, 329 edges, 28 generations

Rules (9 total):

  1. Patronymic: "X ื‘ืŸ Y" โ†’ edge (Y โ†’ X)
  2. Birth verb: "ื•ื™ื•ืœื“/ื•ืชืœื“ ืืช X" โ†’ edge (subject โ†’ X)
  3. Naming: "ื•ืชืงืจื ืฉืžื• X" โ†’ node X
  4. Sons-of: "ื‘ื ื™ X: A, B, C" โ†’ edges (X โ†’ A,B,C)
  5. Father-of: "X ืื‘ื™ Y" โ†’ edge (X โ†’ Y)
  6. Tribe: "ืœืžื˜ื” X" โ†’ edge (Jacob โ†’ X)
  7. Name-intro: "ื•ืฉืžื• X" โ†’ node X
  8. Daughter-of: "X ื‘ืช Y" โ†’ edge (Y โ†’ X)
  9. Standalone: known entity in text โ†’ node registered

Usage:

python3 torah_tree_extractor.py

Author: Eran Eliahu Tuval

License: CC BY 4.0

Data: Sefaria.org API (public domain)

"""

import json, re

from collections import defaultdict

SKIP_WORDS = {

'ืืช', 'ืืœ', 'ืขืœ', 'ื›ืœ', 'ืœื', 'ื›ื™', 'ื’ื', 'ื”ื•ื', 'ื”ื™ื',

'ืื™ืฉ', 'ืืฉื”', 'ื‘ื ื™', 'ื•ืืช', 'ืœื”ื', 'ืืฉืจ', 'ื•ื™ื”ื™', 'ืœื•', 'ืœื”',

'ื‘ื ื™ื', 'ื‘ื ื•ืช', 'ืฉื', 'ื‘ื™ืช', 'ืขื‘ื“', 'ืžืœืš', 'ื™ื”ื•ื”', 'ืืœื”ื™ื',

'ืฉื ื”', 'ืฉื ื™', 'ืžืื”', 'ืฉืœืฉ', 'ืืจื‘ืข', 'ื—ืžืฉ', 'ืฉืฉ', 'ืฉื‘ืข',

'ืฉืžื ื”', 'ืชืฉืข', 'ืขืฉืจ', 'ืฉืœืฉื™ื', 'ืืจื‘ืขื™ื', 'ื—ืžืฉื™ื', 'ืฉืฉื™ื',

'ืฉื‘ืขื™ื', 'ืฉืžื ื™ื', 'ืชืฉืขื™ื', 'ืžืืช', 'ืžืื•ืช'

}

def clean(text):

text = re.sub(r'[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]', '', text)

text = re.sub(r'<[^>]+>', '', text)

text = re.sub(r'&[^;]+;', '', text)

return text

def words(text):

return [w.strip('\u05c3\u05c0,.;:!?')

for w in clean(text).replace('\u05be', ' ').split()

if w.strip('\u05c3\u05c0,.;:!?')]

def extract_tree(torah_json_path):

with open(torah_json_path, 'r', encoding='utf-8') as f:

torah = json.load(f)

edges = [] # (parent, child, book, chapter, verse, rule)

for book in ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']:

current_subject = None

for ch_num in sorted(torah[book].keys(), key=int):

for v_idx, verse in enumerate(torah[book][ch_num]):

ws = words(verse)

Update current subject: "ื•ื™ื—ื™ X"

for i, w in enumerate(ws):

if w in ('ื•ื™ื—ื™', 'ื•ื™ื”ื™') and i+1 < len(ws):

nw = ws[i+1]

if len(nw) >= 2 and nw not in SKIP_WORDS:

current_subject = nw

for i, w in enumerate(ws):

RULE 1: "X ื‘ืŸ Y"

if w == 'ื‘ืŸ' and i > 0 and i+1 < len(ws):

child, parent = ws[i-1], ws[i+1]

if (len(child) >= 2 and len(parent) >= 2

and child not in SKIP_WORDS

and parent not in SKIP_WORDS):

edges.append((parent, child, book, ch_num, v_idx+1, 'ื‘ืŸ'))

RULE 2: "ื•ื™ื•ืœื“ ืืช X"

if w in ('ื•ื™ื•ืœื“', 'ื•ืชืœื“', 'ื”ื•ืœื™ื“', 'ื•ื™ืœื“', 'ื™ืœื“ื”'):

for j in range(i+1, min(i+5, len(ws))):

target = ws[j]

if target == 'ืืช' and j+1 < len(ws):

child = ws[j+1]

if len(child) >= 2 and child not in SKIP_WORDS:

parent = None

for k in range(i-1, max(i-4, -1), -1):

if len(ws[k]) >= 2 and ws[k] not in SKIP_WORDS:

parent = ws[k]

break

if not parent:

parent = current_subject

if parent and parent != child:

edges.append((parent, child, book, ch_num, v_idx+1, 'ื•ื™ื•ืœื“'))

break

elif target not in ('ืœื•', 'ืœื”', 'ืขื•ื“'):

if len(target) >= 2 and target not in SKIP_WORDS:

parent = None

for k in range(i-1, max(i-4, -1), -1):

if len(ws[k]) >= 2 and ws[k] not in SKIP_WORDS:

parent = ws[k]

break

if not parent:

parent = current_subject

if parent and parent != target:

edges.append((parent, target, book, ch_num, v_idx+1, 'ื•ื™ื•ืœื“'))

break

RULE 3: "ื•ืชืงืจื ืฉืžื• X"

if w in ('ื•ืชืงืจื', 'ื•ื™ืงืจื') and i+2 < len(ws):

if ws[i+1] in ('ืฉืžื•', 'ืฉืžื”'):

name = ws[i+2]

if len(name) >= 2 and name not in SKIP_WORDS:

if current_subject:

edges.append((current_subject, name, book, ch_num, v_idx+1, 'ืงืจื_ืฉื'))

Build tree (dedup)

children_of = defaultdict(set)

parent_of = {}

seen = set()

for parent, child, *_ in edges:

if (parent, child) not in seen:

seen.add((parent, child))

children_of[parent].add(child)

if child not in parent_of:

parent_of[child] = parent

all_persons = set()

for p, c in seen:

all_persons.add(p)

all_persons.add(c)

return children_of, parent_of, all_persons, edges

if __name__ == '__main__':

co, po, ap, edges = extract_tree('sefaria_torah.json')

print(f"Persons: {len(ap)}")

print(f"Edges: {len(set((p,c) for p,c,*_ in edges))}")

Longest chain from Adam

def chain(name, visited=None):

if visited is None:

visited = set()

if name in visited:

return [name]

visited.add(name)

if not co.get(name):

return [name]

best = max((chain(c, visited.copy()) for c in co[name]), key=len)

return [name] + best

if 'ืื“ื' in ap:

c = chain('ืื“ื')

print(f"Longest chain: {len(c)} generations")

print(f" {' โ†’ '.join(c)}")

```