# Source code for lda_over_time.models.temporal_lda_model

"""
TemporalLdaModel is a simpler and faster temporal LDA that returns, for each \
time slice, either the average topic weights or the proportion of main topics.

Its main advantage over other models is that it is fast. But it may not handle \
well the variation of the way that a topic is presented (when vocabulary to \
describe the topic varies over the given dataset).
"""
# IMPORTS
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from lda_over_time.models.dtm_model_interface import DtmModelInterface
from multiprocessing import cpu_count
from pyLDAvis.gensim_models import _extract_data as extract_data

import pandas as pd


# TYPING
from typing import List, Optional


class TemporalLdaModel(DtmModelInterface):
    """
    TemporalLdaModel is a simple temporal LDA model, it is faster, but \
    it may not handle well the evolution of topics (because the vocabulary \
    used in a certain topic may vary over time).

    :param corpus: Each item from the list is one document from corpus.
    :type corpus: list[str]

    :param dates: List of timestamps for each document in corpus, each \
    date's position should match with its respective text.
    :type dates: list[str]

    :param date_format: The date format used in `dates`, e.g. "%Y/%m/%d" \
    for "YYYY/MM/DD" format. More info at `documentation`_.
    :type date_format: str

    :param freq: The frequency used to group texts, e.g. "1M15D" for a \
    frequency of a month and 15 days. Useful notations: \
    day = "D"; \
    month = "M"; \
    year = "Y". \
    More info at `pandas`_.
    :type freq: str

    :param n_topics: Number of topics that the DTM model should find. The \
    default value is 100.
    :type n_topics: int, optional

    :param sep: Separator used to split each word, the default value is \
    any blank space.
    :type sep: str, optional

    :param workers: Number of workers (cpus) to use. If not provided, it \
    will use the total number of threads on running machine.
    :type workers: int, optional

    :param aggregator: Specifies how to aggregate all documents in time \
    slice and calculate its proportions. It can be either `average` to \
    calculate the average of topic's weights for each time slice or the \
    proportion of `main` topics in each time slice. Default is `average`.
    :type aggregator: str, optional

    :return: Nothing
    :rtype: None

    .. _documentation: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    .. _pandas: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    """

    def __init__(self,
                 corpus: List[str],
                 dates: List[str],
                 date_format: str,
                 freq: str,
                 n_topics: int = 100,
                 sep: Optional[str] = None,
                 workers: Optional[int] = None,
                 aggregator: str = 'average'):
        """
        I initialize the variables to train model.
        """
        # Save arguments inside object
        self.corpus = corpus
        self.dates = dates
        self.date_format = date_format
        self.freq = freq
        self.n_topics = n_topics
        self.sep = sep
        self.aggregator = aggregator

        # get number of parallel workers (fall back to every available cpu
        # when the caller did not pass an int)
        self.workers = workers if isinstance(workers, int) else cpu_count()

    def __normalize_lda_model(self, corpus, lda_model):
        """
        This method creates the dataframe from the result of lda training \
        and normalizes the weights (sum of weights for a document = 1.0).

        :param corpus: List of Gensim's BoWs, where each BoW represents one \
        text from corpus.
        :type corpus: list[BoW]

        :param lda_model: Trained lda model.
        :type lda_model: gensim.models.ldamulticore.LdaMulticore

        :return: Table of normalized results of Lda model.
        :rtype: pd.core.frame.DataFrame
        """
        # Create stream of weights for each document in corpus
        stream = lda_model.get_document_topics(corpus)

        # Get weights: one row per document, one column per topic id
        lda_weights = pd.DataFrame(map(dict, stream))

        # Normalize weights row-wise (divide each row by its sum)
        return (lda_weights.T / lda_weights.sum(axis=1)).T

    def __prepare_corpus(self, corpus, sep):
        """
        This method takes a list of document's string and convert them to \
        correct object typing asked by Gensim's LdaMulticore.

        :param corpus: List of documents' texts.
        :type corpus: list[str]

        :param sep: Character that separates each word. If not given, all \
        whitespaces are considered separators.
        :type sep: str, optional

        :return: Pair of Gensim's Dictionary instance and list of \
        Gensim's BoW from corpus.
        :rtype: tuple[gensim.corpora.Dictionary, list[BoW]]
        """
        # Split words for each document
        data = [doc.split(sep) for doc in corpus]

        # Train Dictionary (id <-> word mapping)
        dictionary = Dictionary(data)

        # Convert corpus to list of BoWs
        corpus_bow = [dictionary.doc2bow(doc) for doc in data]

        # Return dictionary and list of BoWs
        return dictionary, corpus_bow

    def __extract_main_topics(self, dates, date_format, normalized_model,
                              n_topics):
        """
        This method prepares the model to make a prevalence analysis over \
        the lda model. For this, it will create a table with two columns: \
        one for the date and other for the main topic of each document.

        :param dates: List of timestamps for each document in corpus, each \
        date's position should match with its respective text.
        :type dates: list[str]

        :param date_format: The date format used in `dates`.
        :type date_format: str

        :param normalized_model: Normalized result from lda model over corpus.
        :type normalized_model: pd.core.frame.DataFrame

        :param n_topics: Number of topics the lda model was trained with.
        :type n_topics: int

        :return: Table with date and main topic columns and each row \
        represents a document from corpus.
        :rtype: pd.core.frame.DataFrame
        """
        # load dates with pandas with given format
        dates = pd.to_datetime(dates, format=date_format)

        # get the main topic (highest weight) for each document
        main_topics = normalized_model[list(range(n_topics))].idxmax(axis=1)

        # join dates to main topics
        zipped = list(zip(dates.values, main_topics))

        # create table with date and topic
        return pd.DataFrame(zipped, columns=['date', 'main_topic'])

    def __prepare_pyldavis(self, dates, date_format, freq):
        """
        This method creates the list of periods found in corpus.

        :param dates: List of timestamps for each document in corpus, each \
        date's position should match with its respective text.
        :type dates: list[str]

        :param date_format: The date format used in `dates`.
        :type date_format: str

        :param freq: The frequency used to group texts.
        :type freq: str

        :return: translation of id to date and corpus grouped by timestamp
        """
        # TODO: improve this part...

        # load dates with pandas with given format
        timestamps = pd.to_datetime(dates, format=date_format)

        # create dataframe of dates
        timestamps_df = pd.DataFrame(timestamps, columns=['date'])

        # group dates by timeslice of length period
        dates_by_period = timestamps_df.groupby(
            pd.Grouper(key='date', freq=freq),
            sort=True
        )

        # get timestamps used to group
        timestamp_keys = sorted(dates_by_period.groups.keys())

        # initialize variables that hold translation of id to period and
        # group corpus by period
        id_to_period = {}
        corpus_by_period = {}

        # for each period: enumerate it and extract docs that belong to it
        for _id, key in enumerate(timestamp_keys):
            # enumerate period (1-based, to match the public i index)
            id_to_period[_id + 1] = key

            # get documents that belong to this period
            indexes = dates_by_period.get_group(key).index

            # track documents of this period
            corpus_by_period[key] = list(map(
                lambda idx: self.bow[idx],
                indexes
            ))

        # return values
        return id_to_period, corpus_by_period

    def __train_lda_model(self, corpus, dictionary, n_topics):
        """
        This method trains the lda model.

        :param corpus: List of Gensim's BoWs, where each BoW represents one \
        text from corpus.
        :type corpus: list[BoW]

        :param dictionary: Dictionary with conversion id <-> word.
        :type dictionary: gensim.corpora.dictionary.Dictionary

        :param n_topics: Number of topics that the DTM model should find. \
        The default value is 100.
        :type n_topics: int, optional

        :return: It returns the trained Gensim's model LdaMulticore.
        :rtype: gensim.models.ldamulticore.LdaMulticore
        """
        # minimum_probability=0.0 guarantees every topic shows up in
        # get_document_topics, so the normalized table has all columns;
        # random_state pins the training for reproducibility
        return LdaMulticore(corpus=corpus,
                            id2word=dictionary,
                            num_topics=n_topics,
                            random_state=100,
                            chunksize=100,
                            passes=10,
                            minimum_probability=0.0,
                            per_word_topics=True,
                            workers=self.workers)

    def __train_prevalence(self, docs_date_main_topic, freq,
                           grouped_by_period):
        """
        This method takes the result of LDA and calculates the model of \
        temporal LDA with prevalence (frequency of main topics in a period).

        :param docs_date_main_topic: Table of documents with their date and \
        main topic.
        :type docs_date_main_topic: pd.core.frame.DataFrame

        :param freq: Frequency to group documents.
        :type freq: str

        :param grouped_by_period: Documents grouped by frequency.
        :type grouped_by_period: pd.core.groupby.DataFrameGroupBy

        :return: It returns the temporal LDA trained model.
        :rtype: pd.core.frame.DataFrame
        """
        # Calculate number of posts per period
        posts_per_period = grouped_by_period.size().reset_index(name='count')

        # Group by period and topic
        grouped_by_period_topic = docs_date_main_topic.groupby(
            [
                pd.Grouper(key='date', freq=freq),
                pd.Grouper(key='main_topic')
            ],
            sort=True
        )

        # Calculate frequency of each main topic in a time interval period
        frequency = grouped_by_period_topic. \
            size(). \
            reset_index(name='count'). \
            pivot(index='date',
                  columns='main_topic',
                  values='count').reset_index()

        # Make sure every topic id has a column, even a topic that is never
        # the main topic of any document (pivot only creates columns for
        # observed main topics, which would raise KeyError below)
        frequency = frequency.reindex(
            columns=['date'] + list(range(self.n_topics))
        )

        # Replace nan with zero
        frequency = frequency.fillna(0.0)

        # Create table of prevalence (topic counts divided by posts in the
        # same period)
        prevalence = pd.DataFrame(
            (frequency[list(range(self.n_topics))].values.T /
             posts_per_period['count'].values).T
        )

        # Add column with dates
        prevalence['date'] = frequency['date'].values

        # Return temporal lda model
        return prevalence

    @property
    def n_timeslices(self):
        """
        This attribute should be the number of timeslices found during \
        training.

        :return: It should return the number of time slices found in corpus.
        :rtype: int
        """
        return len(self.id_to_timestamp)
[docs] def get_results(self): """ This method should return a table representing the evolution of each \ topic over time. :return: Returns a Pandas' dataframe where each column represents a \ timeslice and must have a `date` and columns representing each \ topics weight in that period. :rtype: pd.core.frame.DataFrame """ return self.temporal_lda_model
[docs] def get_topic_words(self, topic_id, i, n): """ This method should return the top n words that better describes the \ topic in a specific time slice. :param topic_id: The id of the desired topic. :type topic_id: int :param i: The position of the desired timeslice in chronological \ order, the first (oldest) time slice is indexed by 1. :type i: int :param n:This specifies how many words that better describes the topic \ at a specific time slice should be returned. :type n: int :return: It returns a list of top n words that best describes the \ requested topic in a specific time. :rtype: list[str] """ return [ self.dictionary[word[0]] for word in self.lda_model.get_topic_terms( topic_id, n ) ]
[docs] def prepare_args(self, i): """ This method should return a dictionary with all necessary values to \ call PyLdaVis.prepare method. :param i: The position of the desired timeslice in chronological \ order, the first (oldest) time slice is indexed by 1. :type i: int :return: It returns a dictionary ready to be passed to PyLdaVis :rtype: dict[str, any] """ # get timestamp period = self.id_to_timestamp[i] # return prepared data return extract_data(self.lda_model, self.timestamp_to_corpus[period], self.dictionary)
[docs] def train(self): """ This method trains the dtm model. :return: Nothing. :rtype: None """ # Get dictionary and convert corpus to correct type self.dictionary, self.bow = self.__prepare_corpus(self.corpus, self.sep) # train lda model self.lda_model = self.__train_lda_model(self.bow, self.dictionary, self.n_topics) # get normalized results of lda model from corpus self.normalized = self.__normalize_lda_model(self.bow, self.lda_model) # aggregate is `average`: calculate average if self.aggregator == 'average': # add dates to normalized data self.normalized['date'] = pd.to_datetime( self.dates, format=self.date_format ) # group by frequency and calculate average self.temporal_lda_model = self.normalized.groupby( [pd.Grouper(key='date', freq=self.freq)], sort=True ).mean().reset_index() # aggregator is `main`: calculate proportion of main topics elif self.aggregator == 'main': # prepare model to prevalence analysis self.docs_date_main_topic = \ self.__extract_main_topics(self.dates, self.date_format, self.normalized, self.n_topics) # group documents by date self.grouped_by_frequency = \ self.docs_date_main_topic.groupby(pd.Grouper(key='date', freq=self.freq), sort=True) # calculate temporal lda model self.temporal_lda_model = \ self.__train_prevalence(self.docs_date_main_topic, self.freq, self.grouped_by_frequency) # unknown option: raise error else: raise ValueError( f"Option \"{self.aggregator}\" does not exist. " "Valid options are 'average' and 'main'" ) # get the list of timestamps self.id_to_timestamp, self.timestamp_to_corpus = \ self.__prepare_pyldavis(self.dates, self.date_format, self.freq)