Skip to content

Python API

lanctools exports two classes for working with local ancestry data. LancData contains the genotype and local ancestry data for a set of plink2 .pgen and .lanc files, together with efficient methods for querying this data. FlatLanc is the core data structure which stores local ancestry data in a flattened structure.

lanctools.LancData

The genotype and local ancestry data for a single chromosome/dataset.

Attributes:

Name Type Description
pgen PgenReader

A pgenlib PgenReader object.

pvar PvarReader

A pgenlib PvarReader object.

lanc FlatLanc

A FlatLanc object with local ancestry data.

ancestries list[str]

An ordered list of ancestry names. The integer codes in the .lanc file and self.lanc correspond to indices in this list (e.g. 0 -> ancestries[0]).

plink_prefix str

The prefix for the corresponding plink2 fileset.

Source code in src/lanctools/core.py
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
class LancData:
    """The genotype and local ancestry data for a single chromosome/dataset.

    Attributes:
        pgen (PgenReader): A pgenlib PgenReader object.
        pvar (PvarReader): A pgenlib PvarReader object.
        lanc (FlatLanc): A FlatLanc object with local ancestry data.
        ancestries (list[str]): An ordered list of ancestry names. The integer codes in
            the .lanc file and `self.lanc` correspond to indices in this list (e.g.
            0 -> ancestries[0]).
        plink_prefix (str): The prefix for the corresponding plink2 fileset.
    """

    def __init__(
        self,
        plink_prefix: str,
        lanc_file: str,
        ancestries: Optional[list[str]] = None,
    ):
        """Constructs a LancData from plink2 files.

        Args:
            plink_prefix (str): The prefix for a plink2 fileset.
            lanc_file (str): The path to a .lanc file.
            ancestries (Optional[list[str]]): An optional list of ordered ancestry
                names corresponding to the .lanc file. When omitted, the names
                default to the stringified codes "0", "1", ..., up to the largest
                code found in the .lanc data.
        """
        pgen = PgenReader(bytes(plink_prefix + ".pgen", "utf8"))
        pvar = PvarReader(bytes(plink_prefix + ".pvar", "utf8"))
        lanc = _read_lanc(lanc_file)

        if ancestries is None:
            ancestries = self._default_ancestries(lanc)

        self.pgen = pgen
        self.pvar = pvar
        self.lanc = lanc
        self.ancestries = ancestries
        self.plink_prefix = plink_prefix

    @staticmethod
    def _default_ancestries(lanc: "FlatLanc") -> list[str]:
        """Build default ancestry names so that code ``i`` maps to index ``i``."""
        codes = np.concatenate([lanc.left_haps, lanc.right_haps])
        if codes.size == 0:
            return []
        # Cover every code up to the maximum, not only the codes that occur:
        # the query methods compare codes against list *indices*, so a gap
        # (e.g. only 0 and 2 present) must not shift later names down.
        return [str(code) for code in range(int(codes.max()) + 1)]

    def _haplotype_masks(
        self, lanc: NDArray[np.uint8]
    ) -> tuple[NDArray[np.int32], NDArray[np.int32]]:
        """Return 0/1 indicator arrays, each (N, V, K), for left and right haplotypes."""
        codes = np.arange(len(self.ancestries), dtype=np.uint8)[None, None, :]
        # Slicing with 0:1 / 1:2 keeps the last axis so it broadcasts against K.
        left_mask = (lanc[:, :, 0:1] == codes).astype(np.int32)
        right_mask = (lanc[:, :, 1:2] == codes).astype(np.int32)
        return left_mask, right_mask

    def get_info(self, indices: NDArray[np.uint32]) -> DataFrame:
        """Query info for a set of variants.

        Args:
            indices: The variant indices in pvar order (0-based), shape (V,)

        Returns:
            pandas.DataFrame: One row per variant with the following columns:

                - CHR (str): Chromosome name. \n
                - BP (int): 1-based genomic position. \n
                - REF (str): Reference allele. \n
                - ALT (str): Alternate allele. \n
                - ID (str): Variant identifier. \n
        """

        return _get_info(self.pvar, indices)

    def get_lanc(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.uint8]:
        """Query phased local ancestry.

        Args:
            indices: The variant indices in pvar order (0-based), shape (V,)

        Returns:
            An array of ancestries, shape (N, V, 2); the last axis holds the
                left and right haplotype, in that order.
        """

        left, right = _get_lanc(
            self.lanc.left_haps,
            self.lanc.right_haps,
            self.lanc.breakpoints,
            self.lanc.offsets,
            indices,
        )
        return np.stack((left, right), axis=-1)

    def get_lanc_dosage(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
        """Query local ancestry dosage.

        Args:
            indices: An array of variant indices in pvar order (0-based), shape (V,)

        Returns:
            An array of local ancestry dosages, shape (N, V, K) (where K is the
                number of ancestries). Each entry counts how many of the two
                haplotypes carry that ancestry (0, 1 or 2).
        """

        lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
        left_mask, right_mask = self._haplotype_masks(lanc)
        return left_mask + right_mask

    def get_geno(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
        """Query phased genotypes.

        Args:
            indices: An array of variant indices in pvar order (0-based), shape (V,)

        Returns:
            An array of phased genotypes, shape (N, V, 2)
        """

        return _get_geno(self.pgen, indices)

    def get_lanc_geno(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.int32]:
        """Query genotypes deconvoluted/masked by ancestry.

        Args:
            indices: An array of variant indices in pvar order (0-based), shape (V,)

        Returns:
            An array of genotypes masked by ancestry, shape (N, V, K) (where K is
                the number of ancestries). Entry [n, v, k] is the sum of the
                allele values on the haplotypes of sample n that carry
                ancestry k at variant v.
        """
        geno = np.asarray(self.get_geno(indices), dtype=np.int32)
        lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
        left_mask, right_mask = self._haplotype_masks(lanc)
        return left_mask * geno[:, :, 0:1] + right_mask * geno[:, :, 1:2]

__init__(plink_prefix, lanc_file, ancestries=None)

Constructs a LancData from plink2 files.

Parameters:

Name Type Description Default
plink_prefix str

The prefix for a plink2 fileset.

required
lanc_file str

The path to a .lanc file.

required
ancestries Optional[list[str]]

An optional list of ordered ancestry names corresponding to the .lanc file.

None
Source code in src/lanctools/core.py
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
def __init__(
    self,
    plink_prefix: str,
    lanc_file: str,
    ancestries: Optional[list[str]] = None,
):
    """Constructs a LancData from plink2 files.

    Args:
        plink_prefix (str): The prefix for a plink2 fileset.
        lanc_file (str): The path to a .lanc file.
        ancestries (Optional[list[str]]): An optional list of ordered ancestry
            names corresponding to the .lanc file. When omitted, the names
            default to the stringified codes "0", "1", ..., up to the largest
            code found in the .lanc data.
    """
    pgen = PgenReader(bytes(plink_prefix + ".pgen", "utf8"))
    pvar = PvarReader(bytes(plink_prefix + ".pvar", "utf8"))
    lanc = _read_lanc(lanc_file)

    if ancestries is None:
        codes = np.concatenate([lanc.left_haps, lanc.right_haps])
        # Name every code up to the maximum rather than only the codes that
        # occur: ancestry codes are used as indices into this list, so a gap
        # (e.g. only 0 and 2 present) must not shift later names down.
        if codes.size:
            ancestries = [str(code) for code in range(int(codes.max()) + 1)]
        else:
            ancestries = []

    self.pgen = pgen
    self.pvar = pvar
    self.lanc = lanc
    self.ancestries = ancestries
    self.plink_prefix = plink_prefix

get_geno(indices)

Query phased genotypes.

Parameters:

Name Type Description Default
indices NDArray[uint32]

An array of variant indices in pvar order (0-based), shape (V,)

required

Returns:

Type Description
NDArray[int32]

An array of phased genotypes, shape (N, V, 2)

Source code in src/lanctools/core.py
370
371
372
373
374
375
376
377
378
379
380
def get_geno(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
    """Fetch phased genotypes for the requested variants.

    Args:
        indices: Variant indices in pvar order (0-based), shape (V,)

    Returns:
        The phased genotypes as an array of shape (N, V, 2)
    """
    reader = self.pgen
    return _get_geno(reader, indices)

get_info(indices)

Query info for a set of variants.

Parameters:

Name Type Description Default
indices NDArray[uint32]

The variant indices in pvar order (0-based), shape (V,)

required

Returns:

Type Description
DataFrame

pandas.DataFrame: One row per variant with the following columns:

  • CHR (str): Chromosome name.

  • BP (int): 1-based genomic position.

  • REF (str): Reference allele.

  • ALT (str): Alternate allele.

  • ID (str): Variant identifier.

Source code in src/lanctools/core.py
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
def get_info(self, indices: NDArray[np.uint32]) -> DataFrame:
    """Look up variant metadata for the requested variants.

    Args:
        indices: Variant indices in pvar order (0-based), shape (V,)

    Returns:
        pandas.DataFrame: A table with one row per variant and these columns:

            - CHR (str): Chromosome name. \n
            - BP (int): 1-based genomic position. \n
            - REF (str): Reference allele. \n
            - ALT (str): Alternate allele. \n
            - ID (str): Variant identifier. \n
    """
    variant_reader = self.pvar
    return _get_info(variant_reader, indices)

get_lanc(indices)

Query phased local ancestry.

Parameters:

Name Type Description Default
indices NDArray[unsignedinteger]

The variant indices in pvar order (0-based), shape (V,)

required

Returns:

Type Description
NDArray[uint8]

An array of ancestries, shape (N, V, 2)

Source code in src/lanctools/core.py
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
def get_lanc(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.uint8]:
    """Fetch phased local ancestry for the requested variants.

    Args:
        indices: Variant indices in pvar order (0-based), shape (V,)

    Returns:
        The per-haplotype ancestries as an array of shape (N, V, 2); the
            last axis holds the left and right haplotype, in that order.
    """
    flat = self.lanc
    left, right = _get_lanc(
        flat.left_haps, flat.right_haps, flat.breakpoints, flat.offsets, indices
    )
    # Pair the two haplotypes along a new trailing axis.
    return np.stack((left, right), axis=-1)

get_lanc_dosage(indices)

Query local ancestry dosage.

Parameters:

Name Type Description Default
indices NDArray[uint32]

An array of variant indices in pvar order (0-based), shape (V,)

required

Returns:

Type Description
NDArray[int32]

An array of local ancestry dosages, shape (N, V, K) (where K is the number of ancestries)

Source code in src/lanctools/core.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
def get_lanc_dosage(self, indices: NDArray[np.uint32]) -> NDArray[np.int32]:
    """Query local ancestry dosage.

    Args:
        indices: An array of variant indices in pvar order (0-based), shape (V,)

    Returns:
        An int32 array of local ancestry dosages, shape (N, V, K) (where K is
            the number of ancestries). Each entry counts how many of the two
            haplotypes carry that ancestry (0, 1 or 2).
    """

    lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
    # Ancestry codes 0..K-1 laid out on a trailing axis so they broadcast
    # against the per-haplotype ancestry calls.
    codes = np.arange(len(self.ancestries), dtype=np.uint8)[None, None, :]
    # Slicing with 0:1 / 1:2 keeps the haplotype axis for broadcasting.
    left_mask = (lanc[:, :, 0:1] == codes).astype(np.int32)
    right_mask = (lanc[:, :, 1:2] == codes).astype(np.int32)
    return left_mask + right_mask

get_lanc_geno(indices)

Query genotypes deconvoluted/masked by ancestry.

Parameters:

Name Type Description Default
indices NDArray[unsignedinteger]

An array of variant indices in pvar order (0-based), shape (V,)

required

Returns:

Type Description
NDArray[int32]

An array of genotypes masked by ancestry, shape (N, V, K) (where K is the number of ancestries)

Source code in src/lanctools/core.py
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
def get_lanc_geno(self, indices: NDArray[np.unsignedinteger]) -> NDArray[np.int32]:
    """Query genotypes deconvoluted/masked by ancestry.

    Args:
        indices: An array of variant indices in pvar order (0-based), shape (V,)

    Returns:
        An array of genotypes masked by ancestry, shape (N, V, K) (where K is
            the number of ancestries). Entry [n, v, k] is the sum of the allele
            values on the haplotypes of sample n that carry ancestry k at
            variant v.
    """
    geno = np.asarray(self.get_geno(indices), dtype=np.int32)
    lanc = np.asarray(self.get_lanc(indices), dtype=np.uint8)
    # Ancestry codes 0..K-1 on a trailing axis, broadcast against each haplotype.
    codes = np.arange(len(self.ancestries), dtype=np.uint8)[None, None, :]
    left_mask = (lanc[:, :, 0:1] == codes).astype(np.int32)
    right_mask = (lanc[:, :, 1:2] == codes).astype(np.int32)
    # Each haplotype's allele contributes only to the ancestry it is assigned.
    return left_mask * geno[:, :, 0:1] + right_mask * geno[:, :, 1:2]

lanctools.FlatLanc

Stores .lanc file ancestry data in a flattened structure for fast querying.

Attributes:

Name Type Description
right_haps NDArray[uint8]

Concatenated right haplotypes for all samples, shape (H,)

left_haps NDArray[uint8]

Concatenated left haplotypes for all samples, shape (H,)

breakpoints NDArray[uint32]

Concatenated breakpoints for all samples, shape (H,)

offsets NDArray[uint32]

Cumulative end indices separating samples, shape (N,)

Source code in src/lanctools/core.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
class FlatLanc:
    """Flattened container for the local ancestry data of a .lanc file.

    All samples are concatenated into shared arrays so that queries can run
    quickly over them.

    Attributes:
        left_haps (NDArray[uint8]): Concatenated left haplotypes for all samples, shape (H,)
        right_haps (NDArray[uint8]): Concatenated right haplotypes for all samples, shape (H,)
        breakpoints (NDArray[uint32]): Concatenated breakpoints for all samples, shape (H,)
        offsets (NDArray[uint32]): Cumulative end indices separating samples, shape (N,)
    """

    def __init__(
        self,
        left_haps: NDArray[np.uint8],
        right_haps: NDArray[np.uint8],
        breakpoints: NDArray[np.uint32],
        offsets: NDArray[np.uint32],
    ):
        # The arrays are stored as given; no copy or validation is performed.
        self.left_haps, self.right_haps = left_haps, right_haps
        self.breakpoints, self.offsets = breakpoints, offsets