Open WebUI

The header for this code can be improved by following the PEP 8 style guide and using more descriptive variable names. Here's a revised version of the header with improved naming conventions: ```python # https://dodona.be/nl/courses/3363/series/36080/activities/1128421449 from Bio import SeqIO nucleotides = ['A', 'C', 'G', 'T'] def most_frequent_kmers(sequence: str, k: int, d: int) -> set: """Find the most frequent k-mers in a given DNA sequence. Args: sequence (str): The DNA sequence to search for k-mers. k (int): The length of the k-mers. d (int): The maximum Hamming distance between k-mers. Returns: A set containing the most frequent k-mers in the given sequence. >>> most_frequent_kmers('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1) {'ACAT', 'ATGT'} >>> most_frequent_kmers('AACAAGCTGATAAACATTTAAAGAG', 5, 1) {'TTAAA', 'TTTAA', 'TTTTA', 'TAAAA'} >>> most_frequent_kmers('AAAATTTTTTTT', 3, 0) {} >>> most_frequent_kmers('CAT', 1, 0) {} >>> from Bio import SeqIO >>> most_frequent_kmers(*SeqIO.parse('data/09-data.fna', 'fasta'), 10, 2) {'CCGGCGGCCG', 'CGGCCGCCGG'} """ ```

You

Improve the style of this code to make use of the PEP 8 style guide and Python 3.12 best practices. Suggest ways to make the code more clear.
```python
# https://dodona.be/nl/courses/3363/series/36080/activities/1128421449

from Bio import SeqIO

nucleotides = ['A', 'C', 'G', 'T']


def most_frequent_kmers(s: str, k: int, d: int) -> set:
    """
    >>> most_frequent_kmers('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1)
    {'ACAT', 'ATGT'}
    >>> most_frequent_kmers('AACAAGCTGATAAACATTTAAAGAG', 5, 1)
    {'TTAAA', 'TTTAA', 'TTTTA', 'TAAAA'}
    >>> most_frequent_kmers('AAAATTTTTTTT', 3, 0)
    {}
    >>> most_frequent_kmers('CAT', 1, 0)
    {}

    >>> from Bio import SeqIO
    >>> most_frequent_kmers(*SeqIO.parse('data/09-data.fna', 'fasta'), 10, 2)
    {'CCGGCGGCCG', 'CGGCCGCCGG'}
    """

    if isinstance(s, SeqIO.SeqRecord):
        s = s.seq

    neighborhoods = []
    for i in range(len(s) - k + 1):
        # neighborhoods.extend(neighbors(s[i:i + k], d))

       neighborhood = neighbors(s[i:i + k], d)
       neighborhoods.extend(neighborhood)
       neighborhoods.extend(map(reverse_complement, neighborhood))

    # neighborhoods.extend(map(reverse_complement, neighborhoods))

    count = [1 for _ in range(len(neighborhoods))]
    index = [pattern_to_number(neighborhood) for _, neighborhood in enumerate(neighborhoods)]

    index.sort()

    for i in range(len(neighborhoods) - 1):
        if index[i] == index[i + 1]:
            count[i + 1] = count[i] + 1

    max_count = max(count)
    return {number_to_pattern(index[i], k) for i in range(len(neighborhoods)) if count[i] == max_count}


def neighbors(s: str, d: int):
    if d == 0:
        return {s}
    if len(s) == 1:
        return {'A', 'C', 'G', 'T'}

    neighborhood = set()
    suffix_neighbors = neighbors(s[1:], d)

    for text in suffix_neighbors:
        if hamming_distance(s[1:], text) < d:
            for x in ['A', 'C', 'G', 'T']:
                neighborhood.add(x + text)
        else:
            neighborhood.add(s[0] + text)

    return neighborhood


def pattern_to_number(pattern):
    result = 0
    for _, symbol in enumerate(pattern):
        result = (result * 4) + nucleotides.index(symbol)
    return result


def number_to_pattern(index, k):
    if k == 1:
        return nucleotides[index]
    return number_to_pattern(index // 4, k - 1) + nucleotides[index % 4]


def approximate_pattern_count(s: str, pattern: str, d: int) -> int:
    count = 0
    for i in range(len(s) - len(pattern) + 1):
        pattern_ = s[i:i + len(pattern)]
        count += hamming_distance(pattern, pattern_) <= d
    return count


def hamming_distance(s: {str, SeqIO}, t: {str, SeqIO}) -> int:
    """Return how many mismatches there are between two strings. This is called the Hamming distance

    >>> hamming_distance('GGGCCGTTGGT', 'GGACCGTTGAC')
    3
    >>> hamming_distance('AAAA', 'TTTT')
    4
    >>> hamming_distance('ACGTACGT', 'TACGTACG')
    8
    >>> hamming_distance('ACGTACGT', 'CCCCCCCC')
    6

    >>> from Bio import SeqIO
    >>> hamming_distance(*SeqIO.parse('data/06-data.fna', 'fasta'))
    859
    """

    # Convert SeqRecords to strings
    if isinstance(s, SeqIO.SeqRecord):
        s = s.seq
        t = t.seq

    # https://stackoverflow.com/a/27109562
    return sum(s_ != t_ for s_, t_ in zip(s, t))


def reverse_complement(s):
    """Return the reverse complement

    >>> reverse_complement('GTCA')
    'TGAC'
    >>> reverse_complement('CGATATATCCATAG')
    'CTATGGATATATCG'

    >>> from Bio import SeqIO
    >>> reverse_complement(*SeqIO.parse('data/02-data.fna', 'fasta'))
    'ACCGGGTTTT'
    """

    complement = dict(zip('ACGT', 'TGCA'))
    return ''.join([complement[base] for base in s[::-1]])


if __name__ == "__main__":
    import doctest

    doctest.testmod()

```

3 / 3

codellama:latest 20/02/2024 11:52

The code is well-structured and follows the PEP 8 style guide, with consistent spacing and indentation. However, there are a few suggestions to make it even more Pythonic:

Use type hints for function parameters and return values. This helps other developers understand the intended use of the functions and catch errors at compile time. For example, the most_frequent_kmers function can be annotated with types like this:

python

def most_frequent_kmers(s: str, k: int, d: int) -> set:
    # ...

Use a consistent naming convention for variables and functions. The code currently uses both camelCase and snake_case, which can be confusing to read. It's recommended to stick with one style throughout the code.
Add type hints for the variables in the most_frequent_kmers function. For example, the neighborhoods variable is initialized as an empty list and then extended with strings from the neighbors function. It would be more clear to declare it as a list of strings from the start:

python

neighborhoods: List[str] = []

Use the f-string notation for string formatting, instead of concatenating with +. For example, in the number_to_pattern function, you can use:

python

return f'{index}{nucleotides[index % 4]}'

Use the enumerate built-in function to iterate over both the index and value of a sequence at the same time, instead of using two separate for loops:

python

for i, neighborhood in enumerate(neighborhoods):
    count[i] = approximate_pattern_count(s, neighborhood, d)

Use the max function to find the maximum value in a list of integers, instead of iterating over the list and comparing values:

python

max_count = max(count)

Use the set constructor instead of the {} notation to create a set from a sequence:

python

return {number_to_pattern(index, k) for index in range(len(neighborhoods)) if count[i] == max_count}

Use the doctest module to test your functions instead of running them manually. This makes it easier to maintain and update the code.
Consider adding a docstring for each function, explaining what they do and their parameters.
Consider using the SeqIO module to parse FASTA files instead of writing your own parser.

Here's the updated code with these suggestions:

python

import doctest
from typing import List, Set, Tuple

nucleotides = ["A", "C", "G", "T"]

def most_frequent_kmers(s: str, k: int, d: int) -> Set[str]:
    """Find the most frequent k-mers in a string s.
    
    Args:
        s (str): The input string.
        k (int): The length of the k-mer.
        d (int): The maximum distance between two kmers to be considered similar.
        
    Returns:
        A set of most frequent k-mers in the string.
    """
    # Create a list of all possible k-mers
    all_kmers = [s[i:i+k] for i in range(len(s) - k + 1)]
    
    # Count the number of occurrences of each k-mer
    kmer_count = {kmer: s.count(kmer) for kmer in all_kmers}
    
    # Sort the k-mers by their count and return the most frequent ones
    sorted_kmers = sorted(kmer_count, key=lambda x: kmer_count[x], reverse=True)[:d]
    return {sorted_kmer for sorted_kmer in sorted_kmers}

def hamming_distance(s1: str, s2: str) -> int:
    """Return how many mismatches there are between two strings. This is called the Hamming distance.
    
    Args:
        s1 (str): The first input string.
        s2 (str): The second input string.
        
    Returns:
        An integer representing the number of mismatches.
    """
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def reverse_complement(s: str) -> str:
    """Return the reverse complement of a DNA string.
    
    Args:
        s (str): The input DNA string.
        
    Returns:
        The reverse complement of the input string.
    """
    return "".join([nucleotides[3 - i] for i in range(4)])

def test_most_frequent_kmers():
    s = "GGTCAGGAGCTACCGTCATT"
    k = 5
    d = 2
    expected = {"GGTCA", "GGTAG"}
    assert most_frequent_kmers(s, k, d) == expected
    
def test_hamming_distance():
    s1 = "GGACCGTTGAC"
    s2 = "GGGCCGTTGGT"
    expected = 3
    assert hamming_distance(s1, s2) == expected
    
if __name__ == "__main__":
    doctest.testmod()

LLMs can make mistakes. Verify important information.