Empty strings from an .mpp file

I’m currently working with a Microsoft Project (.mpp) file and encountering an issue where no data is populated from the file—I’m only getting empty strings for specific columns like Milestone and Resource Names. Specifically, I’m trying to parse the .mpp file to extract derived columns and custom fields, but nothing seems to be working.

Has anyone else experienced this problem?
If so, do you have any advice on how to correctly parse an .mpp file to retrieve the necessary data? Any tips or tricks that could help resolve this issue would be greatly appreciated!

Hi,

I’d guess this problem is with your code rather than anything Foundry-specific. That being said, please post your code so we can check that all the transforms API bits are being used correctly.

def compute(uploaded_schedule, tasks_output, projects_output, wbs_output):
    """Build the tasks, projects and WBS datasets from an uploaded schedule.

    ``my_transform`` turns ``uploaded_schedule`` into two tabular objects
    (tasks and projects). Both are converted to Spark DataFrames, the tasks
    are type-cast and enriched with derived columns, and the results are
    written to the three outputs.

    Args:
        uploaded_schedule: Input handed unchanged to ``my_transform``.
        tasks_output: Receives the enriched tasks DataFrame.
        projects_output: Receives ``filename`` and ``file_uploaded_at``.
        wbs_output: Receives the distinct
            (wbs, Text10, wbs_fk, outline_level) combinations.
    """
    spark = SparkSession.builder.getOrCreate()
    combined_tasks_df, combined_projects_df = my_transform(uploaded_schedule)

    tasks = spark.createDataFrame(combined_tasks_df)
    projects = spark.createDataFrame(combined_projects_df)

    # Cast timestamp columns
    timestamp_columns = (
        "start",
        "finish",
        "early_start",
        "early_finish",
        "commitment_start",
        "commitment_finish",
    )
    for name in timestamp_columns:
        tasks = tasks.withColumn(name, F.col(name).cast(TimestampType()))

    # Cast Outline Level and Percent Complete to integer. Previously these
    # columns were re-assigned to themselves, so no cast actually happened.
    for name in ("outline_level", "percent_complete"):
        tasks = tasks.withColumn(name, F.col(name).cast("int"))

    # Ensure boolean type.
    # NOTE(review): with Spark's default (non-ANSI) cast, strings that are
    # not a recognised boolean literal (including "") become null -- confirm
    # the values my_transform emits for this column.
    tasks = tasks.withColumn("Milestones", F.col("Milestones").cast("boolean"))

    tasks.printSchema()

    # Derived keys: Text10 is used as the prefix for both identifiers.
    tasks = tasks.withColumn(
        "wbs_fk", F.concat_ws("_", F.col("Text10"), F.col("wbs"))
    )
    # Duration in milliseconds between start and finish.
    tasks = tasks.withColumn(
        "duration", F.unix_millis(F.col("finish")) - F.unix_millis(F.col("start"))
    )
    tasks = tasks.withColumn(
        "task_id", F.concat_ws("_", F.col("Text10"), F.col("id"))
    )
    # NOTE(review): is_milestone is a constant False and is not derived from
    # the "Milestones" column -- confirm this is intended.
    tasks = tasks.withColumn("is_milestone", F.lit(False))
    tasks = tasks.withColumn("milestone_type", F.lit("Standard"))

    # Drop rows whose id is empty or null.
    tasks = tasks.filter(~(F.col("id") == "")).dropna(subset=["id"])

    # when/otherwise (rather than a bare comparison) keeps is_complete False
    # instead of null when percent_complete is null.
    tasks = tasks.withColumn(
        "is_complete",
        F.when(F.col("percent_complete") == 100, True).otherwise(False),
    )

    SECONDS_IN_MONTH = 30.44 * 24 * 60 * 60
    # Seconds by which early_finish is later than finish (negative if earlier).
    tasks = tasks.withColumn(
        "time_diff",
        F.unix_timestamp(F.col("early_finish")) - F.unix_timestamp(F.col("finish")),
    )
    tasks = tasks.withColumn(
        "status",
        F.when(F.col("time_diff") > SECONDS_IN_MONTH, "behind")
        .when(F.col("time_diff") < 0, "ahead")
        .otherwise("on_time"),
    )

    wbs_df = tasks.select("wbs", "Text10", "wbs_fk", "outline_level").drop_duplicates()

    projects = projects.select(
        F.col("filename"),
        F.col("file_uploaded_at").cast(TimestampType()),
    )

    tasks_output.write_dataframe(tasks)
    projects_output.write_dataframe(projects)
    wbs_output.write_dataframe(wbs_df)

This topic was automatically closed 60 days after the last reply. New replies are no longer allowed.